def test_join_topk(): """Tests a top k with a join :return: None """ limit = 5 query_plan = QueryPlan() # Query plan ts1 = query_plan.add_operator(SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts1', query_plan, False)) ts1_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'ts1_project', query_plan, False)) ts2 = query_plan.add_operator(SQLTableScan('nation.csv', 'select * from S3Object;', False, 'ts2', query_plan, False)) ts2_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'ts2_project', query_plan, False)) j = query_plan.add_operator(HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'j', query_plan, False)) t = query_plan.add_operator(Limit(limit, 't', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts1.connect(ts1_project) ts2.connect(ts2_project) j.connect_left_producer(ts1_project) j.connect_right_producer(ts2_project) j.connect(t) t.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) c.print_tuples() field_names = ['s_nationkey', 'n_nationkey'] assert len(c.tuples()) == limit + 1 assert c.tuples()[0] == field_names num_rows = 0 for t in c.tuples(): num_rows += 1 # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['s_nationkey'] == lt['n_nationkey'] # Write the metrics query_plan.print_metrics()
def test_scan_empty():
    """Executes a scan where no records are returned.

    We test this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when
    selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from s3object limit 0", False, 'ts', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def test_project_perf(): """Executes a projection over many source rows to examine performance. :return: None """ num_rows = 10000 profile_file_name = os.path.join( ROOT_DIR, "../tests-output/" + gen_test_id() + ".prof") query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan random_col_defs = [ RandomIntColumnDef(0, 9), RandomStringColumnDef(10, 20), RandomDateColumnDef(datetime.strptime('2017-01-01', '%Y-%m-%d'), datetime.strptime('2018-01-01', '%Y-%m-%d')) ] random_table_scan = query_plan.add_operator( RandomTableScan(num_rows, random_col_defs, 'random_table_scan', query_plan, False)) project = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_0'], 'r_0'), ProjectExpression(lambda t_: t_['_1'], 'r_1'), ProjectExpression(lambda t_: t_['_2'], 'r_2') ], 'project', query_plan, False)) project.set_profiled(True, profile_file_name) collate = query_plan.add_operator(Collate('collate', query_plan, False)) random_table_scan.connect(project) project.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() query_plan.stop() # Write the metrics s = pstats.Stats(profile_file_name) s.strip_dirs().sort_stats("time").print_stats() assert len(tuples) == num_rows + 1
def test_pandas_project_simple(): """Tests a projection :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object ' 'limit 3;', True, 'ts', query_plan, False)) p = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'), ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_3'], 'n_comment') ], 'p', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['n_regionkey', 'n_nationkey', 'n_comment'] assert len(c.tuples()) == 3 + 1 assert c.tuples()[0] == field_names assert c.tuples()[1] == [ '0', '0', ' haggle. carefully final deposits detect slyly agai' ] assert c.tuples()[2] == [ '1', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert c.tuples()[3] == [ '1', '2', 'y alongside of the pending deposits. carefully special packages ' 'are about the ironic forges. slyly special ' ] # Write the metrics query_plan.print_metrics()
def test_group_count(): """Tests a group by query with a count aggregate :return: None """ num_rows = 0 query_plan = QueryPlan() # Query plan # select s_nationkey, count(s_suppkey) from supplier.csv group by s_nationkey ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False)) g = query_plan.add_operator( Group( ['_3'], [ AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0']) # count(s_suppkey) ], 'g', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) ts.connect(g) g.connect(c) # Start the query query_plan.execute() # Assert the results for _ in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['_0', '_1'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 25 + 1 nation_24 = filter( lambda t: IndexedTuple.build(t, field_names)['_0'] == '24', c.tuples())[0] assert nation_24[1] == 393 assert num_rows == 25 + 1 # Write the metrics query_plan.print_metrics()
def test_aggregate_count():
    """Tests a query with a count aggregate (no group by).

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    # select count(*) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))
    a = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0'])  # count(s_suppkey)
            ],
            'a', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    c.print_tuples()

    field_names = ['_0']

    tuples = c.tuples()

    assert tuples[0] == field_names
    assert IndexedTuple.build(tuples[1], field_names)['_0'] == 10000
    assert len(tuples) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
def test_sort_desc(): """Executes a sorted query. The results are collated. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object ' 'limit 3;', False, 'ts', query_plan, False)) s = query_plan.add_operator( Sort([SortExpression('_5', float, 'DESC')], 's', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) ts.connect(s) s.connect(c) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(c.tuples()) == 3 + 1 assert c.tuples()[0] == ['_0', '_1', '_2', '_3', '_4', '_5', '_6'] assert c.tuples()[1] == [ '1', 'Supplier#000000001', ' N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ', '17', '27-918-335-1736', '5755.94', 'each slyly above the careful' ] assert c.tuples()[2] == [ '3', 'Supplier#000000003', 'q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3', '1', '11-383-516-1199', '4192.40', 'blithely silent requests after the express dependencies are sl' ] assert c.tuples()[3] == [ '2', 'Supplier#000000002', '89eJ5ksX3ImxJQBvxObC,', '5', '15-679-861-2259', '4032.68', ' slyly bold instructions. idle dependen' ] # Write the metrics query_plan.print_metrics()
def test_aggregate_empty():
    """Executes an aggregate query with no results returned.

    We test this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when
    selecting data, though being an aggregate query we can generate the tuple field names based on the
    expressions supplied.

    TODO: Unsure whether the aggregate operator should return field names. It makes sense in one way, but is
    different to how all the other operators behave.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    # select sum(float(s_acctbal)) from supplier.csv limit 0
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False))
    a = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
        ], 'a', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
def test_join_empty():
    """Executes a join where no records are returned.

    We test this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when
    selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    supplier_scan = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'supplier_scan', query_plan, False))
    supplier_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'supplier_project', query_plan, False))
    nation_scan = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object limit 0;', False, 'nation_scan', query_plan, False))
    nation_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'nation_project', query_plan, False))
    supplier_nation_join = query_plan.add_operator(
        HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'supplier_nation_join', query_plan, False))
    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    supplier_scan.connect(supplier_project)
    nation_scan.connect(nation_project)
    supplier_nation_join.connect_left_producer(supplier_project)
    supplier_nation_join.connect_right_producer(nation_project)
    supplier_nation_join.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in collate.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(collate.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, format_): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan Index Files upper = perc * 100 scan = map( lambda p: query_plan.add_operator( SQLTableScan( '{}/data_{}.csv'.format(path, p), "select * from S3Object " " where cast(F0 as float) < {};".format(upper), format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(0, table_parts)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(collate), enumerate(scan)) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def test_scan_simple(): """Executes a scan. The results are then collated. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( TableScan('nation.csv', 'ts', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(c.tuples()) == 25 + 1 assert c.tuples()[0] == ['_0', '_1', '_2', '_3'] assert c.tuples()[1] == [ '0', 'ALGERIA', '0', ' haggle. carefully final deposits detect slyly agai' ] assert c.tuples()[2] == [ '1', 'ARGENTINA', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert c.tuples()[3] == [ '2', 'BRAZIL', '1', 'y alongside of the pending deposits. carefully special packages are about ' 'the ironic forges. slyly special ' ] # Write the metrics query_plan.print_metrics()
def test_aggregate_sum():
    """Tests a query with a sum aggregate (no group by).

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    # select sum(float(s_acctbal)) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))
    a = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
        ], 'a', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert round(IndexedTuple.build(c.tuples()[1], field_names)['_0'], 2) == 45103548.65
    assert len(c.tuples()) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
def test_group_sum(): """Tests a group by query with a sum aggregate :return: None """ query_plan = QueryPlan() # Query plan # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False)) g = query_plan.add_operator( Group(['_3'], [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])) ], 'g', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(g) g.connect(c) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t_)) field_names = ['_0', '_1'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 25 + 1 nation_24 = filter( lambda t_: IndexedTuple.build(t_, field_names)['_0'] == '24', c.tuples())[0] assert round(nation_24[1], 2) == 1833872.56 # Write the metrics query_plan.print_metrics()
def test_filter_2(): # Let's forget about the local filter for now. The pd._expr field of the PredicateExpression class is not well documented and it is needed for the Filter class (on line 102). """ :return: """ query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False) # Query plan ''' ts = query_plan.add_operator( SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False)) ''' # using a 'use_native=True' argument will result in a None object being returned ts = query_plan.add_operator( SQLTableScan('random_strings_2.csv', 'select f1,f2 from S3Object limit 5;', Format.CSV, True, False, False, 'ts', query_plan, False)) def fn(df): df.columns = ["first_col", "second_col"] #We can change the types of the columns #df["first_col"] = df["first_col"].astype(np.int64) return df p = query_plan.add_operator(Project([], 'project', query_plan, False, fn)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() #assert 2 + 1 == len(c.tuples()) print("Tuples:") print(c.tuples()) # Write the metrics query_plan.print_metrics() print(ROOT_DIR)
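# Hedged sketch (not part of the original test suite): test_filter_2 above skips the local Filter operator
# because PredicateExpression's pd._expr field is undocumented. Assuming, as that test does, that Project
# applies its `fn` callback to each pandas DataFrame batch, the same hook can emulate a local filter with
# plain boolean indexing. The column names and the >= 0 predicate below are illustrative assumptions only.
def illustrative_local_filter_fn(df):
    import pandas as pd  # local import keeps the sketch self-contained

    df.columns = ["first_col", "second_col"]
    # Keep only rows whose first column parses as a number and is non-negative (hypothetical predicate).
    return df[pd.to_numeric(df["first_col"], errors="coerce") >= 0]
# It would be wired in the same way test_filter_2 passes fn, e.g.
# Project([], 'project', query_plan, False, illustrative_local_filter_fn).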
def test_project_empty():
    """Executes a projection query with no results returned.

    We test this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when
    selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from S3Object limit 0;", False, 'ts', query_plan, False))
    p = query_plan.add_operator(
        Project([
            ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'),
            ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
            ProjectExpression(lambda t_: t_['_3'], 'n_comment')
        ], 'p', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(p)
    p.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def run(use_pandas, secure, use_native, format_): profile_file_name = os.path.join(ROOT_DIR, "../benchmark-output/" + gen_test_id() + ".prof") os.remove(profile_file_name) if os.path.exists(profile_file_name) else None print("SQL Table Scan | Settings {}".format({'use_pandas': use_pandas, 'secure': secure, 'use_native': use_native})) query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan scan = query_plan.add_operator( SQLTableScan('lineitem.csv', "select " " * " "from " " S3Object limit 100000", format_, use_pandas, secure, use_native, 'scan', query_plan, True)) # scan.set_profiled(True, profile_file_name) null = query_plan.add_operator( Null('null', query_plan, True)) scan.connect(null) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() query_plan.print_metrics() query_plan.stop() # Write the profile # s = pstats.Stats(profile_file_name) # s.strip_dirs().sort_stats("time").print_stats() print("SQL Table Scan | Done")
def test_group_empty():
    """Executes a group by where no records are returned.

    We test this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when
    selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False))
    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
def test_random_scan_simple():
    """Executes a random scan. The results are then collated.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    random_col_defs = [
        RandomIntColumnDef(0, 9),
        RandomStringColumnDef(10, 20),
        RandomDateColumnDef(datetime.strptime('2017-01-01', '%Y-%m-%d'),
                            datetime.strptime('2018-01-01', '%Y-%m-%d'))
    ]

    random_table_scan = query_plan.add_operator(
        RandomTableScan(10, random_col_defs, 'random_table_scan', query_plan, False))
    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    random_table_scan.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    collate.print_tuples()

    # Write the metrics
    query_plan.print_metrics()

    # Assert the results
    assert len(collate.tuples()) == 10 + 1
    assert collate.tuples()[0] == ['_0', '_1', '_2']
def test_filter_empty():
    """Executes a filter where no records are returned.

    We test this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when
    selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()  # Query plan

    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False))
    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast('1996-03-01', timestamp)),
            'f', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def main(): parts = 32 query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan lineitem_scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('lineitem', True, p), "select * from S3Object;", Format.CSV, use_pandas=True, secure=False, use_native=False, name='scan_' + str(p), query_plan=query_plan, log_enabled=False)), range(0, parts)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(collate), enumerate(lineitem_scan)) query_plan.execute()
def test_filter_1(): """ :return: """ query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False) # Query plan ts = query_plan.add_operator( SQLTableScan( 'tpch-sf1/lineitem_sharded/lineitem.csv.0', 'select * from S3Object where cast(l_extendedprice as float) >= 0 and cast(l_extendedprice as float)<= 910;', Format.CSV, True, False, False, 'ts', query_plan, False)) # using a 'use_native=True' argument will result in a None object being returned ''' ts = query_plan.add_operator( SQLTableScan('tpch-sf1/lineitem_sharded/lineitem.csv.0', 'select l_partkey from S3Object limit 5;',Format.CSV , True, False,False, 'ts', query_plan, False)) #random_strings_2.csv ''' c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(c) #f.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() #assert 2 + 1 == len(c.tuples()) print("Tuples:") print(c.tuples()) # Write the metrics query_plan.print_metrics() print(ROOT_DIR)
def sample_table(s3key, k, sort_exp):
    """Given a table name, return a random sample of records. Currently, the returned records are simply the
    first k tuples.

    :param s3key: the s3 object name
    :param k: the number of tuples to return (the number added to the SQL LIMIT clause)
    :param sort_exp: the sort expression on which the top k is chosen
    :return: the list of selected keys from the first k tuples in the table
    """
    projection = "CAST({} as {})".format(sort_exp.col_name, sort_exp.col_type.__name__)
    sql = "SELECT {} FROM S3Object LIMIT {}".format(projection, k)

    q_plan = QueryPlan(None, is_async=False)

    select_op = q_plan.add_operator(
        SQLTableScan(s3key, sql, True, True, False, "sample_{}_scan".format(s3key), q_plan, False))

    from copy import deepcopy
    sample_topk_sort_exp = deepcopy(sort_exp)
    sample_topk_sort_exp.col_index = '_0'

    topk = q_plan.add_operator(
        Top(max_tuples=k, sort_expression=sample_topk_sort_exp, use_pandas=True, name="sampling_topk",
            query_plan=q_plan, log_enabled=False))
    collate = q_plan.add_operator(
        Collate("sample_{}_collate".format(s3key), q_plan, False))

    select_op.connect(topk)
    topk.connect(collate)

    q_plan.execute()
    q_plan.print_metrics()

    return collate.tuples(), select_op, q_plan
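# Hedged usage sketch (an assumption, not taken from the original source): sample_table returns the collated
# sample tuples together with the scan operator and the sampling query plan, so a caller could derive a
# filtering threshold from the sample along these lines. The float conversion and the max() choice are only
# one plausible policy, suited to an ascending sort field.
def example_threshold_from_sample(s3key, k, sort_exp):
    tuples, select_op, q_plan = sample_table(s3key, k, sort_exp)
    values = []
    for t in tuples:
        try:
            values.append(float(t[0]))
        except (TypeError, ValueError):
            pass  # skip a field-names tuple if the collated output happens to carry one
    return max(values) if values else None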
def test_filter_arg(object_, sql_query): # Let's forget about the local filter for now. The pd._expr field of the PredicateExpression class is not well documented and it is needed for the Filter class (on line 102). """ :return: """ query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False) # Query plan ''' ts = query_plan.add_operator( SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False)) ''' # using a 'use_native=True' argument will result in a None object being returned ts = query_plan.add_operator( SQLTableScan(object_, sql_query, Format.CSV, True, False, False, 'ts', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() #assert 2 + 1 == len(c.tuples()) print("Tuples:") print(c.tuples()) # Write the metrics query_plan.print_metrics() print(ROOT_DIR)
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns, queried_aliases, castable_aliases, select_str, aggregate_column, filter_str, path, format_=Format.PARQUET): secure = False use_native = False print('') print("Parquet Aggregate Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}/lineitem.{}.parquet".format(path, p ), "select {} from S3Object " "{};".format(select_str, filter_str), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # project def fn(df): df.columns = queried_aliases df[castable_aliases] = df[castable_aliases].astype(np.double) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(table_first_part, table_first_part + table_parts)) # aggregation def agg_fun(df): if aggregate_column in df: return pd.DataFrame({'sum': [sum(df[aggregate_column])]}) else: return pd.DataFrame({'count': len(df)}, index=[0]) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_first_part, table_parts,
        queried_columns, select_columns, path, format_):
    """Executes the top k query. The sort column is scanned first to build a sample, from which a filtering
    threshold is derived for the second scan; each scan keeps the max/min records in a heap.

    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = map(
        lambda p: query_plan.add_operator(
            # SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p),
                'select {} from S3Object;'.format(sort_field), format_, use_pandas, secure, use_native,
                'column_scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)
    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order, 'float',
            'select {} from S3object '.format(select_columns),
            # ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False))
            ' {} '.format(sort_field), 'sql_gen', query_plan, False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            # SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p),
                "", format_, use_pandas, secure, use_native,
                'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # profile_path = '../benchmark-output/groupby/'
    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)
    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()

    print('Done')

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, nthreads=16, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan Index Files upper = perc * 100 index_scan = map(lambda p: query_plan.add_operator( SQLTableScan('{}/index/index_f0_{}.csv'.format(path, p), "select first_byte, last_byte " " from S3Object " " where name < {};".format(upper), format_, use_pandas, secure, use_native, 'index_scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Range accesses range_access = map(lambda p: query_plan.add_operator( TableRangeAccess('{}/data_{}.csv'.format(path, p), use_pandas, secure, use_native, 'range_access_{}'.format(p), query_plan, False)), range(0, table_parts)) map(lambda o: o.set_nthreads(nthreads), range_access) collate = query_plan.add_operator( Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan)) map(lambda (p, o): o.connect(collate), enumerate(range_access)) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_): """ :return: None """ print('') print("TPCH Q17 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map( lambda p: query_plan.add_operator( tpch_q17.sql_scan_select_partkey_where_brand_container_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_op('part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_lineitem_join_build_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_bloom_create = query_plan.add_operator( tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create', query_plan)) lineitem_bloom_use = \ map(lambda p: query_plan.add_operator( tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op( lineitem_sharded, p, part_parts, use_pandas, secure, use_native, 'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q17. project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_probe_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) part_lineitem_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str( p), query_plan, False)), range(0, other_parts)) part_lineitem_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) lineitem_part_avg_group = map( lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_op( 'lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, other_parts)) lineitem_part_avg_group_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, other_parts)) part_lineitem_join_avg_group_join_build = \ map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_avg_group_join_probe = \ map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op( 'lineitem_filter' + '_' + str(p), query_plan)), range(0, other_parts)) extendedprice_sum_aggregate = map( lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ 
AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q17.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_lineitem_join_probe_map) map(lambda o: o.set_async(False), part_lineitem_join_build_map) map(lambda o: o.set_async(False), lineitem_part_avg_group) map(lambda o: o.set_async(False), lineitem_part_avg_group_project) map(lambda o: o.set_async(False), extendedprice_sum_aggregate) extendedprice_sum_aggregate_project.set_async(False) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project)) connect_many_to_one(part_project, part_bloom_create) connect_one_to_many(part_bloom_create, lineitem_bloom_use) # part_project.connect(part_bloom_create) # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)), # enumerate(part_bloom_create_map)) # part_bloom_create.connect(lineitem_bloom_use) # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)), # enumerate(part_bloom_create)) # lineitem_bloom_use.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use)) # part_lineitem_join.connect_left_producer(part_project) map( lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_lineitem_join_build_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) # part_lineitem_join.connect_right_producer(lineitem_project) # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project)) connect_many_to_many(lineitem_project, part_lineitem_join_probe_map) map( lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(part_lineitem_join_probe_map)) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. 
connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round( float(tuples[1][0]), 10 ) == 372414.2899999995 # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, format_, shards_path): secure = False use_native = False print('') print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format( sort_field, sort_order, k)) print("----------------------") stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p ), "select * from S3Object;", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK sort_expr = [SortExpression(sort_field, float, sort_order)] topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() query_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ 0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
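# Hedged sketch of the bounded-heap idea the baseline docstring above refers to. This is generic Python for
# illustration, not the project's Top operator: for an ascending top k, a heap of size k retains the k smallest
# records seen so far and discards everything larger.
def illustrative_topk_smallest(rows, k, key):
    import heapq  # local import keeps the sketch self-contained

    heap = []  # holds (-key, position, row); the position breaks ties so rows never need to be comparable
    for position, row in enumerate(rows):
        heapq.heappush(heap, (-key(row), position, row))
        if len(heap) > k:
            heapq.heappop(heap)  # drop the largest key retained so far
    return [row for _, _, row in sorted(heap, reverse=True)]
# e.g. illustrative_topk_smallest(rows, 10, key=lambda r: float(r[5])) mirrors an 'ASC' top 10 on column 5.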
def run_head_table_sampling(stats, sort_field_index, sort_field, k, sample_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): secure = False use_native = False print('') print( "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}" .format(sort_field, sort_order, k, sample_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, 1 ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) sample_scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}.{}".format(shards_path, p), 'select {} from S3Object limit {};'.format( sort_field, per_part_samples), format_, use_pandas, secure, use_native, 'sample_scan_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Sampling project def project_fn(df): df.columns = [sort_field] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda op: op.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), 
enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def run_local_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format( sort_field, sort_order)) print("----------------------") stats += [ 'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scan = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda (i, op): sample_scan[i].set_nthreads(n_threads), enumerate(sample_scan)) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 
'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda o: tbl_smpler.connect(o), sample_scan) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts_end - table_parts_start)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
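# Hedged sketch of the second-phase SQL that the sampling runs above rely on. The exact string TopKFilterBuild
# emits is not shown in this file; the function below only illustrates the idea of turning the k-th sampled
# value into a pushed-down predicate, reusing the same 'select * from S3object ' prefix and CAST expression
# the operator is constructed with. Treat it as an assumption, not the operator's real output, and assume a
# non-empty sample.
def illustrative_second_scan_sql(sample_values, k, sort_order, sort_field):
    ordered = sorted(sample_values, reverse=(sort_order == 'DESC'))
    threshold = ordered[min(k, len(ordered)) - 1]  # k-th best value seen in the sample
    comparator = '>=' if sort_order == 'DESC' else '<='
    return "select * from S3object where CAST({} as float) {} {};".format(sort_field, comparator, threshold)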