def test_sort_topk():
    """Executes a sorted top k query, which must use sort and limit operators as record sorting can't be pushed
    into s3. The results are collated.

    :return: None
    """

    limit = 5

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))
    s = query_plan.add_operator(
        Sort([SortExpression('_5', float, 'ASC')], 's', query_plan, False))
    t = query_plan.add_operator(Limit(limit, 't', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(s)
    s.connect(t)
    t.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1', '_2', '_3', '_4', '_5', '_6']

    assert len(c.tuples()) == limit + 1
    assert c.tuples()[0] == field_names

    # Check that the returned records are in ascending order on field '_5',
    # comparing each data row against the previous one.
    prev = None
    num_rows = 0
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))
        if num_rows > 1:
            if prev is not None:
                lt = IndexedTuple.build(t, field_names)
                prev_lt = IndexedTuple.build(prev, field_names)
                assert float(lt['_5']) >= float(prev_lt['_5'])
            prev = t

    # Write the metrics
    query_plan.print_metrics()
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """The baseline test uses hash joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that don't include parts that we
       filtered out in step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are a few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_part_select_all_where_brand_and_container_op(
                part_parts != 1, p, part_parts, use_pandas,
                'part_scan' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_lineitem_select_all_where_partkey_op(
                lineitem_parts != 1, p, lineitem_parts, use_pandas,
                'lineitem_scan' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_brand_container_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p),
                                               query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
project_lineitem_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) # part_lineitem_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)), # range(0, lineitem_parts)) part_lineitem_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_lineitem_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_part_avg_group = map( lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_5_op( 'lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_part_avg_group_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, part_parts)) # part_lineitem_join_avg_group_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_l_partkey_p_partkey_op( # 'part_lineitem_join_avg_group_join', query_plan)), # range(0, lineitem_parts)) part_lineitem_join_avg_group_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild( 'l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_avg_group_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression('l_partkey', 'p_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op( 'lineitem_filter' + '_' + str(p), query_plan)), range(0, part_parts)) extendedprice_sum_aggregate = map( lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q17.collate_op('collate', query_plan)) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter)) # lineitem_scan.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project)) # part_lineitem_join.connect_left_producer(part_project) # part_lineitem_join.connect_right_producer(lineitem_project) # part_lineitem_join.connect(lineitem_part_avg_group) map( lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) map( lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(lineitem_map)) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), 
enumerate(part_lineitem_join_probe)) # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), # enumerate(part_lineitem_join_avg_group_join_build)), # enumerate(part_lineitem_join_avg_group_join_probe)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts]. connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 4632.1085714286
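# For reference, a minimal pandas sketch of the query the plan above computes (TPC-H Q17).
# This helper is illustrative only and not part of the benchmark: it assumes the standard
# Q17 threshold of 0.2 * avg(l_quantity) per part and the final division by 7 (which match
# the 'group_partkey_avg_quantity_5_op' and 'avg_yearly' operators), and the column names
# and 'q17_avg_yearly_sketch' name are assumptions.
def q17_avg_yearly_sketch(lineitem, part, brand, container):
    """lineitem and part are pandas DataFrames using TPC-H column names."""
    # Filter part on the chosen brand and container
    p = part[(part['p_brand'] == brand) & (part['p_container'] == container)]

    # Join lineitem against the filtered parts
    joined = lineitem.merge(p, left_on='l_partkey', right_on='p_partkey')
    joined['l_quantity'] = joined['l_quantity'].astype(float)
    joined['l_extendedprice'] = joined['l_extendedprice'].astype(float)

    # Per-part average quantity, joined back onto the rows
    avg_qty = joined.groupby('l_partkey')['l_quantity'].mean().rename('avg_quantity')
    joined = joined.join(avg_qty, on='l_partkey')

    # Keep rows below the 0.2 * avg(l_quantity) threshold and compute avg_yearly
    filtered = joined[joined['l_quantity'] < 0.2 * joined['avg_quantity']]
    return filtered['l_extendedprice'].sum() / 7.0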
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_): """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3. This works by: 1. Scanning part and filtering on brand and container 2. It then scans lineitem 3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in step 1) 4. It then computes the average of l_quantity from the joined table and groups the results by partkey 5. It then joins these computed averages with the joined table in step 3 6. It then filters out any rows where l_quantity is less than the computed average TODO: There are few ways this can be done, the above is just one. :return: None """ print('') print("TPCH Q17 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_part_select_all_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) lineitem_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_lineitem_select_all_op( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_brand_container_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_filter = map(lambda p: query_plan.add_operator(tpch_q17.filter_brand_container_op( 'part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map(lambda p: query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) part_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_part_avg_group = map(lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, other_parts)) lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, other_parts)) part_lineitem_join_avg_group_join_build = \ map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_avg_group_join_probe = \ map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_filter = map(lambda p: query_plan.add_operator( 
tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)), range(0, other_parts)) extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_filter) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), lineitem_part_avg_group) map(lambda o: o.set_async(False), lineitem_part_avg_group_project) map(lambda o: o.set_async(False), extendedprice_sum_aggregate) extendedprice_sum_aggregate_project.set_async(False) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter)) # lineitem_scan.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project)) # part_lineitem_join.connect_left_producer(part_project) # part_lineitem_join.connect_right_producer(lineitem_project) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(lineitem_map)) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), # enumerate(part_lineitem_join_avg_group_join_build)), # enumerate(part_lineitem_join_avg_group_join_probe)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o), 
enumerate(part_lineitem_join_probe)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 372414.2899999995 # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
def main(sf, format_, parts, sharded, other_parts, table_a_filter_val, table_b_filter_val, expected_result, trial): table_a_filter_sql, _, table_b_filter_sql, _ = runner.build_filters( table_a_filter_val, table_b_filter_val) settings = SyntheticFilteredJoinSettings( parallel=True, use_pandas=True, secure=False, use_native=False, buffer_size=0, use_shared_mem=False, shared_memory_size=-1, format_=format_, sf=sf, table_A_key='customer', table_A_parts=parts, table_A_sharded=sharded, table_A_field_names=['c_custkey'], table_A_filter_sql=table_a_filter_sql, table_A_AB_join_key='c_custkey', table_B_key='orders', table_B_parts=parts, table_B_sharded=sharded, table_B_field_names=[ 'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice', 'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority', 'o_comment' ], table_B_filter_sql=table_b_filter_sql, table_B_AB_join_key='o_custkey', table_B_BC_join_key=None, table_B_detail_field_name='o_totalprice', table_C_key=None, table_C_parts=None, table_C_sharded=None, table_C_field_names=None, table_C_filter_sql=None, table_C_BC_join_key=None, table_C_detail_field_name=None, other_parts=other_parts) path = os.path.join(ROOT_DIR, "../aws-exps/join") filesystem_util.create_dirs(path) out_file = "synthetic_join_2_filtered_sf{}_aval{}_bval{}_trial{}.txt" \ .format(sf, table_a_filter_val, table_b_filter_val, trial) sys.stdout = open(os.path.join(path, out_file), "w+") print("--- TEST: {} ---".format(gen_test_id())) print("--- SCALE FACTOR: {} ---".format(sf)) print("--- FORMAT: {} ---".format(format_)) print("--- CUSTOMER FILTER: {} ---".format(table_a_filter_sql)) print("--- ORDER FILTER: {} ---".format(table_b_filter_sql)) query_plan = synthetic_join_filtered.query_plan(settings) runner.run(query_plan, expected_result=expected_result, test_id=gen_test_id()) sys.stdout.close() subprocess.call(['cat', os.path.join(path, out_file)])
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) def fn(df): df.columns = [ 'G0', 'G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', ] df = df.filter(items=group_fields + agg_fields, axis=1) return df # Scan scan = map( lambda p: query_plan.add_operator( TableScan(files.format(p), use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False, fn=fn)), range(0, table_parts)) # Groupby def groupby_fn(df): df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) # inlined map(lambda p: p.set_async(False), groupby) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) #profile_path = '../benchmark-output/groupby/' #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof")) #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof")) #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof")) #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof")) #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof")) connect_many_to_many(scan, groupby) connect_many_to_one(groupby, groupby_reduce) connect_one_to_one(groupby_reduce, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts, path, format_): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ secure = False use_native = False print('') print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(sort_field, sort_order)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan scan = map(lambda p: query_plan.add_operator( SQLTableScan("{}/topk_data_{}.csv".format(path, p), "select * from S3Object;", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = ['F0', 'F1', 'F2'] df[ [sort_field] ] = df[ [sort_field] ].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map(lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # TopK sort_expr = SortExpression(sort_field, 'float', sort_order) topk = map(lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(0, table_parts)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) #profile_path = '../benchmark-output/topk/' #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof")) #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof")) #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof")) #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof")) #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof")) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
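# The per-partition Top operators above keep only the current best k records, and topk_reduce
# merges those partial results into the global top k. A minimal standard-library sketch of that
# merge step (illustrative only, not the operator's implementation; 'topk_merge_sketch' is a
# hypothetical helper):
def topk_merge_sketch(partitions, k, key, descending=True):
    """Merge per-partition candidate lists (each already truncated to k records) into a global top k.

    partitions: iterable of lists of records.
    key: function extracting the sort value from a record.
    """
    import heapq

    candidates = [rec for part in partitions for rec in part]
    if descending:
        return heapq.nlargest(k, candidates, key=key)
    return heapq.nsmallest(k, candidates, key=key)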
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) ########################## ## Phase 1. Find out group names ########################## # Scan scan_phase1 = map( lambda p: query_plan.add_operator( SQLTableScan( files.format(p), "select {} from S3Object;".format(','.join( group_fields)), format_, use_pandas, secure, use_native, 'scan_phase1_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = group_fields return df project_exprs = [ ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields) ] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # Groupby def groupby_fn(df): return df.drop_duplicates() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) # GroupbyFilterBuild agg_exprs = [('SUM', 'CAST({} AS float)'.format(agg_field)) for agg_field in agg_fields] groupby_filter_build = query_plan.add_operator( GroupbyFilterBuild(group_fields, agg_fields, agg_exprs, 'groupby_filter_build', query_plan, False)) ########################## ## Phase 2. Perform aggregation at S3. ########################## # Scan scan_phase2 = map( lambda p: query_plan.add_operator( SQLTableScan( 'groupby_benchmark/shards-10GB/groupby_data_{}.csv'.format( p), "", format_, use_pandas, secure, use_native, 'scan_phase2_{}'.format(p), query_plan, False)), range(0, table_parts)) groupby_decoder = map( lambda p: query_plan.add_operator( GroupbyDecoder(agg_fields, 'groupby_decoder_{}'.format(p), query_plan, False)), range(0, table_parts)) def groupby_fn_phase2(df): #print df df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby_reduce_phase2 = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce_phase2', query_plan, False, groupby_fn_phase2)) #scan_phase1[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase1_0" + ".prof")) #scan_phase2[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase2_0" + ".prof")) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # phase 1 map(lambda (p, o): o.connect(project[p]), enumerate(scan_phase1)) map(lambda (p, o): o.connect(groupby[p]), enumerate(project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(groupby_filter_build) # phase 2 map(lambda (p, o): groupby_filter_build.connect(o, 0), enumerate(scan_phase2)) map(lambda (p, o): groupby_filter_build.connect(o, 1), enumerate(groupby_decoder)) map(lambda (p, o): o.connect(groupby_decoder[p]), enumerate(scan_phase2)) map(lambda (p, o): o.connect(groupby_reduce_phase2), enumerate(groupby_decoder)) # map(lambda (p, o): groupby_reduce.connect(o), enumerate(groupby_decoder)) groupby_reduce_phase2.connect(collate) # Plan 
settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q17 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_select_partkey_where_brand_container_op( part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create_map = map(lambda p: query_plan.add_operator( Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_lineitem_join_build_map = map(lambda p: query_plan.add_operator( Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_bloom_create = map(lambda p: query_plan.add_operator( tpch_q17.bloom_create_partkey_op('part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_bloom_use = map(lambda p: query_plan.add_operator( tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op( part_parts != 1, p, part_parts, use_pandas, 'lineitem_bloom_use' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q17.project_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_probe_map = map(lambda p: query_plan.add_operator( Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, True)), range(0, lineitem_parts)) # part_lineitem_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join' + '_' + str(p), query_plan)), # range(0, part_parts)) part_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str(p), query_plan, True)), range(0, part_parts)) lineitem_part_avg_group = map(lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_op('lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, part_parts)) # part_lineitem_join_avg_group_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_l_partkey_p_partkey_op( # 'part_lineitem_join_avg_group_join' + '_' + str(p), query_plan)), # range(0, part_parts)) part_lineitem_join_avg_group_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_lineitem_join_avg_group_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_filter = map(lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)), range(0, part_parts)) 
extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan)) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project)) # part_project.connect(part_bloom_create) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)), enumerate(part_bloom_create_map)) # part_bloom_create.connect(lineitem_bloom_use) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)), enumerate(part_bloom_create)) # lineitem_bloom_use.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use)) # part_lineitem_join.connect_left_producer(part_project) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_lineitem_join_build_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) # part_lineitem_join.connect_right_producer(lineitem_project) map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project)) map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(part_lineitem_join_probe_map)) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') 
print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 4632.1085714286
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) def fn(df): # df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'] # df[ ['l_extendedprice'] ] = df[ ['l_extendedprice'] ].astype(np.float) # criterion = (df['l_extendedprice'] >= lower) & ( # df['l_extendedprice'] <= upper) df[['_5']] = df[['_5']].astype(np.float) criterion = (df['_5'] >= lower) & (df['_5'] <= upper) return df[criterion] # scan the file scan_filter = map( lambda p: query_plan.add_operator( TableScan(get_file_key('lineitem', True, p, sf=sf), use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False, fn=fn)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(aggregate), enumerate(scan_filter)) aggregate.connect(collate) # scan_filter[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof")) # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof")) # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof")) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts, customer_sharded, order_sharded, lineitem_sharded, other_parts, sf, expected_result, customer_filter_sql=None, order_filter_sql=None, lineitem_filter_sql=None): """ :return: None """ print('') print("TPCH Q3 Baseline Join") print("---------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) customer_scan = map( lambda p: query_plan.add_operator( SQLTableScan( get_file_key('customer', customer_sharded, p, sf, format_ ), "select " " * " "from " " S3Object " " {} " " {} ".format( ' where ' + customer_filter_sql if customer_filter_sql is not None else '', get_sql_suffix('customer', customer_parts, p, customer_sharded, add_where=customer_filter_sql is None) ), format_, use_pandas, secure, use_native, 'customer_scan' + '_{}'.format(p), query_plan, False)), range(0, customer_parts)) def customer_project_fn(df): df = df.filter(items=['_0', '_6'], axis=1) df.rename(columns={ '_0': 'c_custkey', '_6': 'c_mktsegment' }, inplace=True) return df customer_project = map( lambda p: query_plan.add_operator( Project([], 'customer_project' + '_{}'.format(p), query_plan, False, customer_project_fn)), range(0, customer_parts)) def customer_filter_fn(df): return df['c_mktsegment'] == 'BUILDING' customer_filter = map( lambda p: query_plan.add_operator( Filter(PredicateExpression(pd_expr=customer_filter_fn), 'customer_filter' + '_{}'.format(p), query_plan, False)), range(0, customer_parts)) customer_map = map( lambda p: query_plan.add_operator( Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False) ), range(0, customer_parts)) order_scan = map( lambda p: query_plan.add_operator( SQLTableScan( get_file_key('orders', order_sharded, p, sf, format_ ), "select " " * " "from " " S3Object " " {} " " {} ".format( ' where ' + order_filter_sql if order_filter_sql is not None else '', get_sql_suffix('orders', order_parts, p, order_sharded, add_where=order_filter_sql is None) ), format_, use_pandas, secure, use_native, 'order_scan' + '_{}'.format(p), query_plan, False)), range(0, order_parts)) def order_project_fn(df): df = df.filter(items=['_0', '_1', '_4', '_7'], axis=1) df.rename(columns={ '_1': 'o_custkey', '_0': 'o_orderkey', '_4': 'o_orderdate', '_7': 'o_shippriority' }, inplace=True) return df order_project = map( lambda p: query_plan.add_operator( Project([], 'order_project' + '_{}'.format(p), query_plan, False, order_project_fn)), range(0, customer_parts)) def order_filter_fn(df): return pd.to_datetime(df['o_orderdate']) < pd.Timestamp( datetime.strptime('1995-03-01', '%Y-%m-%d').date()) order_filter = map( lambda p: query_plan.add_operator( Filter(PredicateExpression(pd_expr=order_filter_fn), 'order_filter' + '_{}'.format(p), query_plan, False)), range(0, order_parts)) order_map_1 = map( lambda p: query_plan.add_operator( Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)), range(0, order_parts)) customer_order_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str( p), query_plan, False)), range(0, other_parts)) customer_order_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'), 'customer_order_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) lineitem_scan = map( lambda p: query_plan.add_operator( SQLTableScan( get_file_key('lineitem', lineitem_sharded, p, sf, format_ ), "select " " * " "from " " 
S3Object " " {} " " {} ".format( ' where ' + lineitem_filter_sql if lineitem_filter_sql is not None else "", get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded, add_where=lineitem_filter_sql is None) ), format_, use_pandas, secure, use_native, 'lineitem_scan' + '_{}'.format(p), query_plan, False)), range(0, lineitem_parts)) def lineitem_project_fn(df): df = df.filter(items=['_0', '_5', '_6', '_10'], axis=1) df.rename(columns={ '_0': 'l_orderkey', '_5': 'l_extendedprice', '_6': 'l_discount', '_10': 'l_shipdate' }, inplace=True) return df lineitem_project = map( lambda p: query_plan.add_operator( Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False, lineitem_project_fn)), range(0, lineitem_parts)) def lineitem_filter_fn(df): return pd.to_datetime(df['l_shipdate']) > pd.Timestamp( datetime.strptime('1995-03-01', '%Y-%m-%d').date()) lineitem_filter = map( lambda p: query_plan.add_operator( Filter(PredicateExpression(pd_expr=lineitem_filter_fn), 'lineitem_filter' + '_{}'.format(p), query_plan, False)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) order_map_2 = map( lambda p: query_plan.add_operator( Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False) ), range(0, other_parts)) customer_order_lineitem_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'), 'customer_order_lineitem_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) def groupby_fn(df): df['l_extendedprice'] = df['l_extendedprice'].astype(np.float) df['l_discount'] = df['l_discount'].astype(np.float) df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount']) grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group = map( lambda p: query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression( AggregateExpression.SUM, lambda t_: float(t_[ 'l_extendedprice'] * (1 - t_['l_discount']))) ], 'group' + '_{}'.format(p), query_plan, False, groupby_fn)), range(0, other_parts)) def group_reduce_fn(df): grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group_reduce = query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression( AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group_reduce', query_plan, False, group_reduce_fn)) top = query_plan.add_operator( Top(10, [ SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC') ], use_pandas, 'top', query_plan, False)) collate = query_plan.add_operator( tpch_q19.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), customer_project) map(lambda o: o.set_async(False), order_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), customer_filter) map(lambda o: o.set_async(False), order_filter) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), customer_map) 
map(lambda o: o.set_async(False), order_map_1) map(lambda o: o.set_async(False), order_map_2) # Connect the operators connect_many_to_many(customer_scan, customer_project) connect_many_to_many(customer_project, customer_filter) connect_many_to_many(customer_filter, customer_map) connect_many_to_many(order_scan, order_project) connect_many_to_many(order_project, order_filter) connect_many_to_many(order_filter, order_map_1) connect_all_to_all(customer_map, customer_order_join_build) connect_many_to_many(customer_order_join_build, customer_order_join_probe) connect_all_to_all(order_map_1, customer_order_join_probe) # connect_many_to_one(customer_order_join_probe, collate) connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_filter) connect_many_to_many(lineitem_filter, lineitem_map) connect_many_to_many(customer_order_join_probe, order_map_2) connect_all_to_all(order_map_2, customer_order_lineitem_join_build) connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe) connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe) # connect_many_to_one(customer_order_lineitem_join_probe, collate) connect_many_to_many(customer_order_lineitem_join_probe, group) # connect_many_to_one(group, collate) connect_many_to_one(group, group_reduce) connect_one_to_one(group_reduce, top) connect_one_to_one(top, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("customer_parts: {}".format(customer_parts)) print("order_parts: {}".format(order_parts)) print("lineitem_parts: {}".format(lineitem_parts)) print("customer_sharded: {}".format(customer_sharded)) print("order_sharded: {}".format(order_sharded)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue'] assert len(tuples) == 10 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL test_util.assert_tuples(expected_result, tuples)
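# For reference, a minimal pandas sketch of what the Q3 plan above computes, using the same
# filters, join keys, grouping and ordering as the operators (illustrative only; the
# 'q3_sketch' helper and column handling are simplified assumptions):
def q3_sketch(customer, orders, lineitem):
    import pandas as pd

    # Filters pushed into the plan's filter operators
    cust = customer[customer['c_mktsegment'] == 'BUILDING']
    ords = orders[pd.to_datetime(orders['o_orderdate']) < pd.Timestamp('1995-03-01')]
    items = lineitem[pd.to_datetime(lineitem['l_shipdate']) > pd.Timestamp('1995-03-01')]

    # customer x orders on custkey, then x lineitem on orderkey
    joined = cust.merge(ords, left_on='c_custkey', right_on='o_custkey') \
                 .merge(items, left_on='o_orderkey', right_on='l_orderkey')
    joined['revenue'] = joined['l_extendedprice'].astype(float) * (1 - joined['l_discount'].astype(float))

    # Group, then take the top 10 by revenue desc, o_orderdate asc
    grouped = joined.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])['revenue'].sum().reset_index()
    return grouped.sort_values(['revenue', 'o_orderdate'], ascending=[False, True]).head(10)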
def run(parallel, buffer_size, parts): """ :return: None """ print('') print("TPCH Q14 Partitioned Bloom Join") print("-------------------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan)) sum_aggregate = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'sum_aggregate', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) hash_join_build_ops = [] hash_join_probe_ops = [] part_bloom_create_ops = [] line_item_scan_ops = [] for p in range(0, parts): part_scan = query_plan.add_operator( tpch_q14.sql_scan_part_partkey_type_part_where_brand12_sharded_operator_def(p, parts, 'part_scan' + '_' + str(p), query_plan)) part_scan_project = query_plan.add_operator( tpch_q14.project_partkey_type_operator_def('part_scan_project' + '_' + str(p), query_plan)) part_bloom_create = query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create' + '_' + str(p), query_plan)) part_bloom_create_ops.append(part_bloom_create) lineitem_scan = query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_sharded_operator_def(min_shipped_date, max_shipped_date, p, parts, 'lineitem_scan' + '_' + str(p), query_plan)) line_item_scan_ops.append(lineitem_scan) lineitem_scan_project = query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def('lineitem_scan_project' + '_' + str(p), query_plan)) join_build = query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)) hash_join_build_ops.append(join_build) join_probe = query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)) hash_join_probe_ops.append(join_probe) aggregate = query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def('aggregate' + '_' + str(p), query_plan)) part_scan.connect(part_scan_project) lineitem_scan.connect(lineitem_scan_project) part_scan_project.connect(part_bloom_create) # part_bloom_create.connect(lineitem_scan) part_scan_project.connect(join_build) join_probe.connect(aggregate) # aggregate.connect(aggregate_project) aggregate.connect(sum_aggregate) join_probe.connect_tuple_producer(lineitem_scan_project) # join_probe.connect(merge) for bloom_create_op in part_bloom_create_ops: for scan_op in line_item_scan_ops: bloom_create_op.connect(scan_op) for probe_op in hash_join_probe_ops: for build_op in hash_join_build_ops: probe_op.connect_build_producer(build_op) sum_aggregate.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print("parts: {}".format(parts)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(parts)) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) 
== 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL # assert tuples[1] == [15.090116526324298] assert round(float(tuples[1][0]), 10) == 15.0901165263
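# The two SUM expressions reduced in sum_aggregate correspond to the two terms of the TPC-H Q14
# promo_revenue formula, which aggregate_project then combines. Because both terms are plain sums,
# the per-partition aggregates can simply be added before taking the ratio. A sketch of the final
# arithmetic, assuming the standard Q14 definition ('promo_revenue_sketch' is a hypothetical helper):
#
#   promo_revenue = 100.0 * sum(l_extendedprice * (1 - l_discount) where p_type like 'PROMO%')
#                         / sum(l_extendedprice * (1 - l_discount))
def promo_revenue_sketch(promo_sum, total_sum):
    """Combine the two reduced sums into the Q14 percentage."""
    return 100.0 * promo_sum / total_sum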
def test_group_baseline(): """TPC-H Q1 :return: None """ # Query plan # select # l_returnflag, # l_linestatus, # sum(l_quantity) as sum_qty, # sum(l_extendedprice) as sum_base_price, # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, # avg(l_quantity) as avg_qty, # avg(l_extendedprice) as avg_price, # avg(l_discount) as avg_disc, # count(*) as count_order # from # lineitem # where # l_shipdate <= date '1992-04-01' - interval '[DELTA]' day(3) # group by # l_returnflag, # l_linestatus # order by # l_returnflag, # l_linestatus; query_plan = QueryPlan() delta_days = 60 # TODO: This is supposed to be randomized I think shipped_date = datetime.strptime('1992-04-01', '%Y-%m-%d') - timedelta(days=delta_days) ts = query_plan.add_operator(SQLTableScan("lineitem.csv", "select * from S3Object " "where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp)" .format(shipped_date.strftime('%Y-%m-%d')), False, 'lineitem', query_plan, False)) g = query_plan.add_operator(Group( [ '_8', # l_returnflag '_9' # l_linestatus ], [ # sum(l_quantity) AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_4'])), # sum(l_extendedprice) as sum_base_price AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])), # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']) * (1 - float(t_['_6']))), # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']) * (1 - float(t_['_6'])) * (1 + float(t_['_7']))), # avg(l_quantity) AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_4'])), # avg(l_extendedprice) AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_5'])), # avg(l_discount) AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_6'])), # count(*) as count_order AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0']) ], 'lineitem_grouped', query_plan, False)) s = query_plan.add_operator(Sort( [ SortExpression('_0', str, 'ASC'), SortExpression('_1', str, 'ASC') ], 'lineitem_group_sorted', query_plan, False)) c = query_plan.add_operator(Collate('collation', query_plan, False)) ts.connect(g) g.connect(s) s.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 2 + 1 # These have been verified in Postgres assert c.tuples()[1] == ["A", "F", 129850, 194216048.19000033, 184525343.78730044, 191943492.96455324, 25.445816186556925, 38059.19031746035, 0.050005878894768374, 5103] assert c.tuples()[2] == ["R", "F", 129740, 193438367.33999985, 183701990.7670003, 191045646.36937532, 25.509241053873353, 38033.49731419579, 0.05061541486433399, 5086] # Write the metrics query_plan.print_metrics()
def run(parallel, buffer_size):
    """Tests a query with sharded operators and separate build and probe operators for the join.

    :return: None
    """

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    parts = 2

    collate = query_plan.add_operator(Collate('collate', query_plan, False))
    merge = query_plan.add_operator(Merge('merge', query_plan, False))

    hash_join_build_ops = []
    hash_join_probe_ops = []
    for p in range(1, parts + 1):

        r_region_key_lower = math.ceil((5.0 / float(parts)) * (p - 1))
        r_region_key_upper = math.ceil((5.0 / float(parts)) * p)

        region_scan = query_plan.add_operator(
            SQLTableScan('region.csv',
                         'select * '
                         'from S3Object '
                         'where cast(r_regionkey as int) >= {} and cast(r_regionkey as int) < {};'
                         .format(r_region_key_lower, r_region_key_upper),
                         False, 'region_scan' + '_' + str(p), query_plan, False))

        region_project = query_plan.add_operator(
            Project([
                ProjectExpression(lambda t_: t_['_0'], 'r_regionkey'),
                ProjectExpression(lambda t_: t_['_1'], 'r_name')
            ], 'region_project' + '_' + str(p), query_plan, False))

        n_nation_key_lower = math.ceil((25.0 / float(parts)) * (p - 1))
        n_nation_key_upper = math.ceil((25.0 / float(parts)) * p)

        nation_scan = query_plan.add_operator(
            SQLTableScan('nation.csv',
                         'select * from S3Object '
                         'where cast(n_nationkey as int) >= {} and cast(n_nationkey as int) < {};'
                         .format(n_nation_key_lower, n_nation_key_upper),
                         False, 'nation_scan' + '_' + str(p), query_plan, False))

        nation_project = query_plan.add_operator(
            Project([
                ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
                ProjectExpression(lambda t_: t_['_1'], 'n_name'),
                ProjectExpression(lambda t_: t_['_2'], 'n_regionkey')
            ], 'nation_project' + '_' + str(p), query_plan, False))

        region_hash_join_build = query_plan.add_operator(
            HashJoinBuild('r_regionkey', 'region_hash_join_build' + '_' + str(p), query_plan, False))
        hash_join_build_ops.append(region_hash_join_build)

        region_nation_join_probe = query_plan.add_operator(
            HashJoinProbe(JoinExpression('r_regionkey', 'n_regionkey'),
                          'region_nation_join_probe' + '_' + str(p), query_plan, False))
        hash_join_probe_ops.append(region_nation_join_probe)

        region_scan.connect(region_project)
        nation_scan.connect(nation_project)
        region_project.connect(region_hash_join_build)
        region_nation_join_probe.connect_tuple_producer(nation_project)
        region_nation_join_probe.connect(merge)

    for probe_op in hash_join_probe_ops:
        for build_op in hash_join_build_ops:
            probe_op.connect_build_producer(build_op)

    merge.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['r_regionkey', 'r_name', 'n_nationkey', 'n_name', 'n_regionkey']

    assert len(tuples) == 25 + 1
    assert tuples[0] == field_names

    num_rows = 0
    for t in tuples:
        num_rows += 1
        # Assert that each nation has been joined with the region whose r_regionkey matches its n_regionkey
        if num_rows > 1:
            lt = IndexedTuple.build(t, field_names)
            assert lt['r_regionkey'] == lt['n_regionkey']
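# The key-range bounds computed in the loop above split the 5 region keys and 25 nation keys into
# contiguous, non-overlapping [lower, upper) ranges, one per partition. A quick check of the
# arithmetic (illustrative only; '_partition_bounds' is a hypothetical helper, not part of the test):
def _partition_bounds(num_keys, parts):
    """Return the [lower, upper) key range handled by each of the `parts` partitions."""
    import math
    return [(int(math.ceil((float(num_keys) / parts) * (p - 1))),
             int(math.ceil((float(num_keys) / parts) * p)))
            for p in range(1, parts + 1)]

# _partition_bounds(5, 2)  -> [(0, 3), (3, 5)]    (r_regionkey ranges)
# _partition_bounds(25, 2) -> [(0, 13), (13, 25)] (n_nationkey ranges)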
def run(sf, parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, sharded, format_): """ :return: None """ print('') print("TPCH Q1 Filtered Group By") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan date = '1998-12-01' max_shipped_date = datetime.strptime(date, '%Y-%m-%d') - timedelta(days=117) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q1.sql_filtered_scan_lineitem_operator_def( max_shipped_date, sharded, p, sf, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str( p), query_plan, format_)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q1.project_lineitem_from_filtered_scan_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) groupby = map( lambda p: query_plan.add_operator( tpch_q1.groupby_returnflag_linestatus_operator_def( 'groupby' + '_' + str(p), query_plan)), range(0, lineitem_parts)) groupby_reduce = query_plan.add_operator( tpch_q1.groupby_reduce_returnflag_linestatus_operator_def( 'groupby_reduce', query_plan, use_pandas)) # lineitem_scan[0].set_profiled(True, os.path.join(ROOT_DIR, "../tests-output/", gen_test_id() + "_scan_0" + ".prof")) # lineitem_project[0].set_profiled(True, # os.path.join(ROOT_DIR, "../tests-output/", gen_test_id() + "_project_0" + ".prof")) # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, "../tests-output/", gen_test_id() + "_groupby_0" + ".prof")) # groupby_reduce.set_profiled(True, # os.path.join(ROOT_DIR, "../tests-output/", gen_test_id() + "_groupby_reduce" + ".prof")) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda o: o.set_async(False), lineitem_project) # map(lambda o: o.set_async(False), groupby) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(groupby[p]), enumerate(lineitem_project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(lineitem_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # This date is chosen because it triggers the filter to filter out 1 of the rows in the root data set. date = '1996-03-13' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_extra_filtered_operator_def( lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project)) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(join_build)), enumerate(join_probe)) map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o), enumerate(lineitem_filter)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: 
{}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 33.4262326420
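# Sketch of the final ratio that project_promo_revenue_operator_def presumably computes from the two
# reduced sums (promo revenue assumed in '_0', total revenue in '_1'), following the TPC-H Q14
# definition. The helper name and argument order are illustrative assumptions, not the operator's API.
def promo_revenue(promo_sum, total_sum):
    # 100 * sum(CASE WHEN p_type LIKE 'PROMO%' THEN l_extendedprice * (1 - l_discount) END)
    #     / sum(l_extendedprice * (1 - l_discount))
    return 100.0 * promo_sum / total_sum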
def test_operators(): system = WorkerSystem() query_plan = QueryPlan(system, is_async=True, buffer_size=0) # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object ' 'limit 3;', True, False, False, 'scan', query_plan, True)) p = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_1'], 'n_name'), ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'), ProjectExpression(lambda t_: t_['_3'], 'n_comment') ], 'project', query_plan, True)) c = query_plan.add_operator(Collate('collate', query_plan, True)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = c.tuples() c.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment'] assert len(tuples) == 3 + 1 assert tuples[0] == field_names assert tuples[1] == [ '0', 'ALGERIA', '0', ' haggle. carefully final deposits detect slyly agai' ] assert tuples[2] == [ '1', 'ARGENTINA', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert tuples[3] == [ '2', 'BRAZIL', '1', 'y alongside of the pending deposits. carefully special packages ' 'are about the ironic forges. slyly special ' ]
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan( files.format(p), "select {} from S3Object;".format(','.join( group_fields + agg_fields)), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = group_fields + agg_fields return df project_exprs = [ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields)] \ + [ProjectExpression(lambda t_: t_['_{}'.format(n + len(group_fields))], v) for n, v in enumerate(agg_fields)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # Groupby def groupby_fn(df): df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(groupby[p]), enumerate(project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
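# A tiny standalone run of the pandas group-by function used above, to show the expected shape of its
# input (string-typed aggregate columns) and output (one summed row per group). The column names here
# are made up purely for illustration.
import pandas as pd

def demo_groupby():
    group_fields, agg_fields = ['g'], ['x', 'y']
    df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': ['1', '2', '3'], 'y': ['10', '20', '30']})
    df[agg_fields] = df[agg_fields].astype(float)
    grouped = df.groupby(group_fields)
    agg_df = pd.DataFrame({f: grouped[f].sum() for f in agg_fields})
    # -> g='a': x=3.0, y=30.0 ; g='b': x=3.0, y=30.0
    return agg_df.reset_index()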
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts, sharded): """ :return: None """ print('') print("TPCH Q19 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Define the operators lineitem_scan = map(lambda p: query_plan.add_operator( tpch_q19.sql_scan_lineitem_select_all_where_partkey_op(sharded, p, lineitem_parts, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map(lambda p: query_plan.add_operator( tpch_q19.sql_scan_part_select_all_where_partkey_op(sharded, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_brand_size_container_op('part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map(lambda p: query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) # lineitem_part_join = map(lambda p: # query_plan.add_operator(tpch_q19.join_op(query_plan)), # range(0, part_parts)) lineitem_part_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'lineitem_partjoin_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) lineitem_part_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'lineitem_part_join_probe' + '_' + str(p), query_plan, True)), range(0, part_parts)) filter_op = map(lambda p: query_plan.add_operator(tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate = map(lambda p: query_plan.add_operator(tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q19.aggregate_project_def('aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan)) # Connect the operators # lineitem_scan.connect(lineitem_project) connect_many_to_many(lineitem_scan, lineitem_project) connect_all_to_all(lineitem_project, lineitem_map) # part_scan.connect(part_project) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) # lineitem_part_join.connect_left_producer(lineitem_project) connect_all_to_all(part_map, lineitem_part_join_build) # lineitem_part_join.connect_right_producer(part_project) connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe) connect_many_to_many(lineitem_map, lineitem_part_join_probe) # lineitem_part_join.connect(filter_op) connect_many_to_many(lineitem_part_join_probe, filter_op) # filter_op.connect(aggregate) connect_many_to_many(filter_op, aggregate) # aggregate.connect(aggregate_project) connect_many_to_one(aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) # aggregate_project.connect(collate) connect_one_to_one(aggregate_project, collate) # Write the plan graph 
query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL numpy.testing.assert_almost_equal(tuples[1], 92403.0667)
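# The connect_* helpers used above encapsulate the enumerate/map wiring spelled out in the older
# queries. A sketch of their assumed topology semantics follows; this is not the framework's source,
# and the real helpers presumably dispatch to connect_build_producer / connect_tuple_producer when the
# consumer is a hash-join operator rather than plain connect.
def connect_one_to_one(producer, consumer):
    producer.connect(consumer)

def connect_many_to_one(producers, consumer):
    for o in producers:
        o.connect(consumer)

def connect_many_to_many(producers, consumers):
    # producer i feeds consumer i (assumes equal-length lists)
    for p, o in enumerate(producers):
        o.connect(consumers[p])

def connect_all_to_all(producers, consumers):
    # every producer feeds every consumer (e.g. partitioned maps -> all hash-join builds)
    for o in producers:
        for c in consumers:
            o.connect(c)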
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """The filtered test uses hash joins, but projections and filtering are first pushed down to S3. :return: None """ print('') print("TPCH Q14 Filtered Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = query_plan.add_operator( tpch_q14. sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_sharded_operator_def( min_shipped_date, max_shipped_date, False, 0, use_pandas, 'lineitem_scan', query_plan)) lineitem_project = query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_project', query_plan)) part_scan = query_plan.add_operator( tpch_q14. sql_scan_part_partkey_type_part_where_brand12_partitioned_operator_def( 0, 1, use_pandas, 'part_scan', query_plan)) part_project = query_plan.add_operator( tpch_q14.project_partkey_type_operator_def('part_project', query_plan)) join = query_plan.add_operator( tpch_q14.join_lineitem_part_operator_def('join', query_plan)) aggregate = query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) lineitem_scan.connect(lineitem_project) part_scan.connect(part_project) join.connect_left_producer(lineitem_project) join.connect_right_producer(part_project) join.connect(aggregate) aggregate.connect(aggregate_project) aggregate_project.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert tuples[1] == [15.090116526324298]
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas, buffer_size, table_parts, lower, upper, sharded, sf, use_shared_mem, inline_ops, merge_ops): secure = False use_native = False if use_shared_mem: system = WorkerSystem(shared_mem_buffer_size) else: system = None query_plan = QueryPlan(system, is_async=parallel, buffer_size=buffer_size, use_shared_mem=use_shared_mem) # scan the file scan = map( lambda p: query_plan.add_operator( TableScan(get_file_key('lineitem', sharded, p, sf=sf), use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(0, table_parts)) def filter_fn(df): return (df['_5'].astype(np.float) >= lower) & (df['_5'].astype( np.float) <= upper) filter_ = map( lambda p: query_plan.add_operator( Filter(PredicateExpression(None, filter_fn), 'filter_{}'.format(p), query_plan, False)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda p: p.set_async(not inline_ops), filter_) connect_many_to_many(scan, filter_) connect_many_to_one(filter_, aggregate) connect_one_to_one(aggregate, collate) profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops, use_shared_mem) scan[0].set_profiled( True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + profile_file_suffix + ".prof")) filter_[0].set_profiled( True, os.path.join( ROOT_DIR, "../benchmark-output/", gen_test_id() + "_filter_0" + profile_file_suffix + ".prof")) aggregate.set_profiled( True, os.path.join( ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + profile_file_suffix + ".prof")) collate.set_profiled( True, os.path.join( ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + profile_file_suffix + ".prof")) return query_plan
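# Quick standalone check of the boolean-mask predicate used by the Filter operator above: the
# PredicateExpression's pandas function receives a DataFrame and returns a boolean Series selecting
# rows whose l_extendedprice (column '_5') lies in [lower, upper]. Illustrative only.
import pandas as pd

def demo_filter(lower=900.0, upper=1100.0):
    df = pd.DataFrame({'_5': ['850.5', '1000.0', '1200.25']})
    mask = (df['_5'].astype(float) >= lower) & (df['_5'].astype(float) <= upper)
    return df[mask]  # keeps only the 1000.0 row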
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_operator_def( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str( p), query_plan, sf, format_)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) # part_filter = map(lambda p: # query_plan.add_operator( # tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), # range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() sum2_ = df['_1'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: 
o.set_async(False), part_aggregate) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_filter) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) connect_all_to_all(part_map, join_build) connect_many_to_many(join_build, join_probe) connect_many_to_many(lineitem_filter, lineitem_map) connect_all_to_all(lineitem_map, join_probe) connect_many_to_many(join_probe, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(lineitem_parts)) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
def run(sort_field, k, sample_size, parallel, use_pandas, sort_order, buffer_size, table_first_part, table_parts, queried_columns, select_columns, path, format_): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ secure = False use_native = False print('') print("Top K Benchmark, Column Sampling and Scan. Sort Field: {}, Order: {}".format(sort_field, sort_order)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling per_part_samples = int(sample_size / table_parts) sample_scan = map(lambda p: query_plan.add_operator( SQLTableScan("{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), 'select {} from S3Object limit {};'.format(sort_field, per_part_samples), format_, use_pandas, secure, use_native, 'sample_column_scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # Column project def project_fn1(df): df.columns = [sort_field] df[ [sort_field] ] = df[ [sort_field] ].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map(lambda p: query_plan.add_operator( Project(project_exprs, 'column_sample_project_{}'.format(p), query_plan, False, project_fn1)), range(table_first_part, table_first_part + table_parts)) # TopK samples sort_expr1 = SortExpression(sort_field, float, sort_order) sample_topk = query_plan.add_operator( Top(k, sort_expr1, use_pandas, 'column_sample_topk', query_plan, False)) # Generate SQL command for full column scan sql_gen1 = query_plan.add_operator( TopKFilterBuild( sort_order, 'float', 'select {} from S3object '.format(sort_field), ' {} '.format(sort_field), 'sql_gen1', query_plan, False )) # Scanning entire column column_scan = map(lambda p: query_plan.add_operator( SQLTableScan("{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "", format_, use_pandas, secure, use_native, 'column_scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) column_project = map(lambda p: query_plan.add_operator( Project(project_exprs, 'column_project_{}'.format(p), query_plan, False, project_fn1)), range(table_first_part, table_first_part + table_parts)) # TopK over the full column sort_expr2 = SortExpression(sort_field, float, sort_order) column_topk = query_plan.add_operator( Top(k, sort_expr2, use_pandas, 'column_topk', query_plan, False)) # Generate SQL command for full scan of all selected columns sql_gen2 = query_plan.add_operator( TopKFilterBuild( sort_order, 'float', 'select {} from S3object '.format(select_columns), ' {} '.format(sort_field), 'sql_gen2', query_plan, False )) # Scan scan = map(lambda p: query_plan.add_operator( SQLTableScan("{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # Project def project_fn2(df): df.columns = queried_columns df[ [sort_field] ] = df[ [sort_field] ].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map(lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, 
False, project_fn2)), range(table_first_part, table_first_part + table_parts)) # TopK topk = map(lambda p: query_plan.add_operator( Top(k, sort_expr2, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr2, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) #profile_path = '../benchmark-output/groupby/' #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof")) #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof")) #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof")) #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof")) #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof")) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project)) sample_topk.connect(sql_gen1) map(lambda (p, o): sql_gen1.connect(o), enumerate(column_scan)) map(lambda (p, o): o.connect(column_project[p]), enumerate(column_scan)) map(lambda (p, o): o.connect(column_topk), enumerate(column_project)) column_topk.connect(sql_gen2) map(lambda (p, o): sql_gen2.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
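# Sketch of the sampling trick driving this benchmark: a small sample yields a conservative cut-off for
# the k-th value, and TopKFilterBuild is expected to append a WHERE clause so the follow-up scans only
# fetch rows that can still make the top k. The clause format and helper name here are assumptions for
# illustration, not the operator's actual output.
def build_topk_where(sample_values, k, sort_field, sort_order):
    vals = sorted(sample_values, reverse=(sort_order == 'DESC'))
    threshold = vals[min(k, len(vals)) - 1]
    op = '>=' if sort_order == 'DESC' else '<='
    return " where cast({} as float) {} {};".format(sort_field, op, threshold)

# build_topk_where([5.0, 1.0, 9.0, 3.0], 2, 'l_extendedprice', 'ASC')
#   -> " where cast(l_extendedprice as float) <= 3.0;"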
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_): """ :return: None """ print('') print("TPCH Q19 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Define the operators lineitem_bloom_use = \ map(lambda p: query_plan.add_operator( tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) part_bloom_create = query_plan.add_operator( tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan, fp_rate)) part_scan = map( lambda p: query_plan.add_operator( tpch_q19. sql_scan_part_partkey_brand_size_container_where_filtered_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) lineitem_project = \ map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q19.project_partkey_brand_size_container_filtered_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) # part_bloom_create_map = map(lambda p: # query_plan.add_operator( # Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)), # range(0, part_parts)) # lineitem_part_join = map(lambda p: # query_plan.add_operator(tpch_q19.join_op(query_plan)), # range(0, part_parts)) lineitem_part_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str( p), query_plan, False)), range(0, other_parts)) lineitem_part_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'lineitem_part_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) filter_op = map( lambda p: query_plan.add_operator( tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)), range(0, other_parts)) aggregate = map( lambda p: query_plan.add_operator( tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan, use_pandas)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( tpch_q19.aggregate_project_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q19.collate_op('collate', query_plan)) map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), filter_op) map(lambda o: o.set_async(False), aggregate) aggregate_project.set_async(False) # Connect 
the operators # part_scan.connect(part_project) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) # part_project.connect(part_bloom_create) connect_many_to_one(part_project, part_bloom_create) # part_bloom_create.connect(lineitem_bloom_use) connect_one_to_many(part_bloom_create, lineitem_bloom_use) # lineitem_bloom_use.connect(lineitem_project) connect_many_to_many(lineitem_bloom_use, lineitem_project) connect_many_to_many(lineitem_project, lineitem_map) # lineitem_part_join.connect_left_producer(lineitem_project) connect_all_to_all(part_map, lineitem_part_join_build) # lineitem_part_join.connect_right_producer(part_project) connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe) connect_all_to_all(lineitem_map, lineitem_part_join_probe) # lineitem_part_join.connect(filter_op) connect_many_to_many(lineitem_part_join_probe, filter_op) # filter_op.connect(aggregate) connect_many_to_many(filter_op, aggregate) # aggregate.connect(aggregate_project) connect_many_to_one(aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) # aggregate_project.connect(collate) connect_one_to_one(aggregate_project, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, nthreads=16, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) assert sf == 1 or sf == 10 # Scan Index Files index_scan = map(lambda p: query_plan.add_operator( SQLTableScan('tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}'.format(sf, p if sf == 1 else p + 1), "select first_byte, last_byte " " from S3Object " " where cast(col_values as float) >= {} and cast(col_values as float) <= {};".format( lower, upper), format_, use_pandas, secure, use_native, 'index_scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Range accesses range_access = map(lambda p: query_plan.add_operator( TableRangeAccess(get_file_key('lineitem', True, p, sf=sf,format_=Format.CSV), use_pandas, secure, use_native, 'range_access_{}'.format(p), query_plan, False)), range(0, table_parts)) map(lambda o: o.set_nthreads(nthreads), range_access) # Aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan)) map(lambda (p, o): o.connect(aggregate), enumerate(range_access)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
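# The index files above hold (first_byte, last_byte) offsets per matching value, and TableRangeAccess
# presumably turns each pair into a ranged GET against the sharded lineitem object. A sketch of that
# translation using boto3; the bucket and key names are placeholders, not the framework's API.
import boto3

def fetch_ranges(bucket, key, byte_ranges):
    s3 = boto3.client('s3')
    chunks = []
    for first_byte, last_byte in byte_ranges:
        resp = s3.get_object(Bucket=bucket, Key=key,
                             Range='bytes={}-{}'.format(int(first_byte), int(last_byte)))
        chunks.append(resp['Body'].read())
    return b''.join(chunks)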
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns, select_columns, lower, upper, path, format_=Format.CSV): secure = False use_native = False print('') print("CSV Filter+Project Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}/lineitem.tbl.{}".format(path, p ), "select {} from S3Object " "where cast(l_orderkey as int) >= {} and cast(l_orderkey as int) <= {};" .format(select_columns, lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # project def fn(df): df.columns = queried_columns df[['l_orderkey']] = df[['l_orderkey']].astype(np.int64) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(table_first_part, table_first_part + table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def test_pandas_filter_baseline(): """ :return: """ query_plan = QueryPlan(buffer_size=8192) # Query plan ts = query_plan.add_operator( SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;', True, 'ts', query_plan, False)) # f = query_plan.add_operator( # Filter(PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast('1996-03-01', timestamp)), # 'f', query_plan, # False)) def pd_expr(df): # df['_10'] = pd.to_datetime(df['_10']) return pd.to_datetime(df['_10']) >= '1996-03-01' f = query_plan.add_operator( Filter(PredicateExpression(None, pd_expr), 'f', query_plan, True)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(f) f.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert 2 + 1 == len(c.tuples()) field_names = [ '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15' ] assert c.tuples()[0] == field_names assert c.tuples()[1] == [ '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O', '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK', 'egular courts above the' ] assert c.tuples()[2] == [ '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O', '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL', 'ly final dependencies: slyly bold ' ] # Write the metrics query_plan.print_metrics()
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded): """ :return: None """ print('') print("TPCH Q14 Semi Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) part_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_partkey_where_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_1' + '_' + str(p), query_plan)), range(0, part_parts)) part_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_p_partkey_operator_def( 'part_1_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def( 'part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_l_partkey_operator_def( 'lineitem_1_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_1_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_1_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_1_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) join_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_l_partkey_operator_def( 'join_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) part_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_2' + '_' + str(p), query_plan)), range(0, part_parts)) part_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_type_operator_def( 'part_2_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_2_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_2_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_2_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_2_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( 
tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(part_1_project[p]), enumerate(part_scan_1)) map(lambda (p, o): o.connect(part_bloom_create[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(part_bloom_create)), enumerate(lineitem_scan_1)) map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_1_build)), enumerate(part_lineitem_join_1_probe)) map(lambda (p, o): o.connect(lineitem_1_project[p]), enumerate(lineitem_scan_1)) map( lambda (p, o): part_lineitem_join_1_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_1_project)) map(lambda (p, o): o.connect(join_bloom_create[p]), enumerate(part_lineitem_join_1_probe)) map( lambda (p, o): map(lambda (bp, bo): o.connect(bo), enumerate(part_scan_2)), enumerate(join_bloom_create)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(join_bloom_create)), enumerate(lineitem_scan_2)) map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2)) map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]), enumerate(part_2_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_2_build)), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(lineitem_2_project[p]), enumerate(lineitem_scan_2)) map( lambda (p, o): part_lineitem_join_2_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_2_project)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
def test_filter_baseline(): """ :return: """ query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False) # Query plan ''' ts = query_plan.add_operator( SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False)) ''' # using a 'use_native=True' argument will result in a None object being returned ts = query_plan.add_operator( SQLTableScan('random_strings_2.csv', 'select * from S3Object limit 3;', Format.CSV, True, False, False, 'ts', query_plan, False)) f = query_plan.add_operator( Filter( PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast( '1996-03-01', timestamp)), 'f', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(f) f.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) print(len(c.tuples())) assert 2 + 1 == len(c.tuples()) field_names = [ '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15' ] assert c.tuples()[0] == field_names assert c.tuples()[1] == [ '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O', '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK', 'egular courts above the' ] assert c.tuples()[2] == [ '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O', '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL', 'ly final dependencies: slyly bold ' ] # Write the metrics query_plan.print_metrics()
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file ''' scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_), "select * from S3Object " "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};".format( lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) ''' scan = map( lambda p: query_plan.add_operator( SQLTableScan( get_file_key('lineitem', True, p, sf=sf, format_=format_ ), "select * from S3Object " "where l_extendedprice >= {} and l_extendedprice <= {};". format(lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # project def fn(df): df.columns = [ 'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment' ] df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def test_join_baseline_pandas(): """Tests a join :return: None """ query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan supplier_scan = query_plan.add_operator( SQLTableScan('region.csv', 'select * from S3Object;', True, False, False, 'supplier_scan', query_plan, True)) def supplier_project_fn(df): df = df.filter(['_0'], axis='columns') df = df.rename(columns={'_0': 'r_regionkey'}) return df supplier_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'r_regionkey')], 'supplier_project', query_plan, True, supplier_project_fn)) nation_scan = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object;', True, False, False, 'nation_scan', query_plan, True)) def nation_project_fn(df): df = df.filter(['_2'], axis='columns') df = df.rename(columns={'_2': 'n_regionkey'}) return df nation_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_2'], 'n_regionkey')], 'nation_project', query_plan, True, nation_project_fn)) supplier_nation_join_build = query_plan.add_operator( HashJoinBuild('n_regionkey', 'supplier_nation_join_build', query_plan, True)) supplier_nation_join_probe = query_plan.add_operator( HashJoinProbe(JoinExpression('n_regionkey', 'r_regionkey'), 'supplier_nation_join_probe', query_plan, True)) collate = query_plan.add_operator(Collate('collate', query_plan, True)) supplier_scan.connect(supplier_project) nation_scan.connect(nation_project) nation_project.connect(supplier_nation_join_build) supplier_nation_join_probe.connect_build_producer(supplier_nation_join_build) supplier_nation_join_probe.connect_tuple_producer(supplier_project) supplier_nation_join_probe.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['n_regionkey', 'r_regionkey'] assert len(tuples) == 25 + 1 assert tuples[0] == field_names num_rows = 0 for t in tuples: num_rows += 1 # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['n_regionkey'] == lt['r_regionkey']
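# A minimal pure-Python sketch of the build/probe split exercised above: HashJoinBuild materialises a
# hash table on the build key, and HashJoinProbe streams tuples past it, emitting one joined row per
# match. Field handling in the real operators is richer; this is only to illustrate the two phases.
def build_phase(build_rows, build_key):
    table = {}
    for row in build_rows:
        table.setdefault(row[build_key], []).append(row)
    return table

def probe_phase(table, probe_rows, probe_key):
    for row in probe_rows:
        for match in table.get(row[probe_key], []):
            merged = dict(match)
            merged.update(row)
            yield merged

# table = build_phase([{'n_regionkey': 0, 'n_name': 'ALGERIA'}], 'n_regionkey')
# list(probe_phase(table, [{'r_regionkey': 0, 'r_name': 'AFRICA'}], 'r_regionkey'))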