def test_aggregate_count():
    """Tests a query with a count aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select count(*) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))

    a = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0'])  # count(s_suppkey)
            ],
            'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    c.print_tuples()

    field_names = ['_0']

    tuples = c.tuples()

    assert tuples[0] == field_names
    assert IndexedTuple.build(tuples[1], field_names)['_0'] == 10000
    assert len(tuples) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
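# --- Illustrative sketch (not part of the original test) ---
# A rough, pure-Python analog of what the IndexedTuple lookup in the assertion above does:
# pair the positional field names with a result tuple so a value can be read by name.
# This is an assumption inferred from the assertion, not the library's implementation.
def _indexed_tuple_sketch(tuple_, field_names):
    return dict(zip(field_names, tuple_))

# e.g. _indexed_tuple_sketch([10000], ['_0'])['_0'] == 10000, mirroring the count assertion.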
def aggregate_def(name, query_plan, use_pandas):
    def fn(df):
        sum1_ = (df['l_extendedprice'].astype(np.float) *
                 (1 - df['l_discount'].astype(np.float))).sum()
        return pd.DataFrame({'_0': [sum1_]})

    return Aggregate(
        [
            AggregateExpression(
                AggregateExpression.SUM,
                lambda t_: float(t_['l_extendedprice']) * float((1 - float(t_['l_discount']))))
        ],
        use_pandas, name, query_plan, False, fn)
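# --- Illustrative sketch (not part of the original operator) ---
# A self-contained pandas example of the revenue sum that fn above computes:
# sum(l_extendedprice * (1 - l_discount)). The two rows below are made-up values.
import pandas as pd

_toy = pd.DataFrame({'l_extendedprice': ['100.0', '200.0'], 'l_discount': ['0.1', '0.0']})
_toy_sum = (_toy['l_extendedprice'].astype(float) * (1 - _toy['l_discount'].astype(float))).sum()
# _toy_sum == 290.0 (0.9 * 100 + 1.0 * 200)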
def test_aggregate_empty():
    """Executes an aggregate query with no results returned. We test this because it is somewhat
    peculiar with S3 Select: S3 does not return column names when selecting data, though because
    this is an aggregate query we can generate the tuple field names from the expressions supplied.

    TODO: Unsure whether the aggregate operator should return field names. It makes sense in one
    way, but differs from how all the other operators behave.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select sum(float(s_acctbal)) from supplier.csv limit 0
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False))

    a = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
            ],
            'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
def test_aggregate_sum():
    """Tests a query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select sum(float(s_acctbal)) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))

    a = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
            ],
            'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert round(IndexedTuple.build(c.tuples()[1], field_names)['_0'], 2) == 45103548.65
    assert len(c.tuples()) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
def aggregate_sum_extendedprice_op(use_pandas, name, query_plan):
    """with extendedprice_sum_aggregate as (
        select
            sum(l_extendedprice)
        from
            filter_join_2
    )

    :param use_pandas:
    :param query_plan:
    :param name:
    :return:
    """

    def fn(df):
        sum1_ = df['l_extendedprice'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    return Aggregate(
        [
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice']))
        ],
        use_pandas, name, query_plan, False, fn)
def aggregate_promo_revenue_operator_def(name, query_plan):
    # type: (str, QueryPlan) -> Aggregate

    def ex1(t_):
        v1 = float(t_['l_extendedprice']) * (1.0 - float(t_['l_discount']))

        rx = re.compile('^PROMO.*$')

        if rx.search(t_['p_type']):  # p_type
            v2 = v1
        else:
            v2 = 0.0

        return v2

    def ex2(t_):
        v1 = float(t_['l_extendedprice']) * (1.0 - float(t_['l_discount']))
        return v1

    def agg_fn(df):
        mask = df['p_type'].str.startswith('PROMO')

        df1 = df[mask]['l_extendedprice'].astype(np.float) * (1.0 - df[mask]['l_discount'].astype(np.float))
        df2 = df['l_extendedprice'].astype(np.float) * (1.0 - df['l_discount'].astype(np.float))

        sum1 = df1.sum()
        sum2 = df2.sum()

        df5 = pd.DataFrame({'_0': [sum1], '_1': [sum2]})

        return df5

    return Aggregate(
        [
            AggregateExpression(AggregateExpression.SUM, ex1),
            AggregateExpression(AggregateExpression.SUM, ex2)
        ],
        True, name, query_plan, False, agg_fn)
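# --- Illustrative sketch (not part of the original operator) ---
# agg_fn above emits two sums: '_0' = revenue from PROMO parts and '_1' = total revenue.
# Assuming the standard TPC-H Q14 definition, a downstream projection turns these into
# promo_revenue = 100 * _0 / _1. The toy rows below are made-up values.
import pandas as pd

_toy = pd.DataFrame({
    'p_type': ['PROMO BRUSHED COPPER', 'STANDARD POLISHED TIN'],
    'l_extendedprice': ['100.0', '200.0'],
    'l_discount': ['0.1', '0.0'],
})
_mask = _toy['p_type'].str.startswith('PROMO')
_promo = (_toy[_mask]['l_extendedprice'].astype(float) * (1.0 - _toy[_mask]['l_discount'].astype(float))).sum()
_total = (_toy['l_extendedprice'].astype(float) * (1.0 - _toy['l_discount'].astype(float))).sum()
_promo_revenue = 100 * _promo / _total  # ~31.03 for these made-up rows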
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """The baseline test uses joins with no projection and no filtering pushed down to S3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. Scanning lineitem
    3. Joining the two tables (essentially filtering out lineitems that don't include the parts kept in step 1)
    4. Computing the average of l_quantity from the joined table, grouped by partkey
    5. Joining these computed averages with the joined table from step 3
    6. Filtering out any rows where l_quantity is less than the computed average

    TODO: There are a few ways this can be done; the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_part_select_all_where_brand_and_container_op(
                part_parts != 1, p, part_parts, use_pandas,
                'part_scan' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_lineitem_select_all_where_partkey_op(
                lineitem_parts != 1, p, lineitem_parts, use_pandas,
                'lineitem_scan' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_brand_container_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)),
    #                          range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_5_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join', query_plan)),
    #                                         range(0, lineitem_parts))

    part_lineitem_join_avg_group_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('l_partkey',
                          'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'),
                          'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, part_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce', query_plan, False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))
    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
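# --- Illustrative usage sketch (not part of the original module) ---
# How this baseline run might be invoked; the parameter values are placeholders chosen
# purely for illustration, not recommended settings.
if __name__ == '__main__':
    run(parallel=True, use_pandas=True, buffer_size=8192, lineitem_parts=32, part_parts=4)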
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns,
        select_columns, lower, upper, path, format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("CSV Filter+Project Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}/lineitem.tbl.{}".format(path, p),
                "select {} from S3Object "
                "where cast(l_orderkey as int) >= {} and cast(l_orderkey as int) <= {};"
                .format(select_columns, lower, upper),
                format_, use_pandas, secure, use_native,
                'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def fn(df):
        df.columns = queried_columns
        df[['l_orderkey']] = df[['l_orderkey']].astype(np.int64)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # Aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
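# --- Illustrative usage sketch (not part of the original benchmark) ---
# A possible invocation of the filter+project benchmark; the S3 path prefix, column lists
# and key range below are placeholders chosen purely for illustration.
if __name__ == '__main__':
    run(parallel=True, use_pandas=True, buffer_size=8192,
        table_first_part=0, table_parts=4,
        queried_columns=['l_orderkey', 'l_extendedprice'],
        select_columns='l_orderkey, l_extendedprice',
        lower=1, upper=60000,
        path='tpch-sf1', format_=Format.CSV)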
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts,
        lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q19 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_bloom_use = \
        map(lambda p: query_plan.add_operator(
            tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
                lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native,
                'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)),
            range(0, lineitem_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan, fp_rate))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q19.sql_scan_part_partkey_brand_size_container_where_filtered_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    lineitem_project = \
        map(lambda p: query_plan.add_operator(
            tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op(
                'lineitem_project' + '_' + str(p), query_plan)),
            range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
        range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q19.project_partkey_brand_size_container_filtered_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_bloom_create_map = map(lambda p:
    #                             query_plan.add_operator(
    #                                 Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)),
    #                             range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    lineitem_part_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'lineitem_part_join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    filter_op = map(
        lambda p: query_plan.add_operator(
            tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
        range(0, other_parts))

    aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan, use_pandas)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))

    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), filter_op)
    map(lambda o: o.set_async(False), aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # part_project.connect(part_bloom_create)
    connect_many_to_one(part_project, part_bloom_create)

    # part_bloom_create.connect(lineitem_bloom_use)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # lineitem_bloom_use.connect(lineitem_project)
    connect_many_to_many(lineitem_bloom_use, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_all_to_all(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
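# --- Illustrative sketch (not part of the original benchmark) ---
# aggregate_reduce_fn above collapses the per-partition partial sums (one '_0' value per
# upstream aggregate operator) into a single total. A self-contained example with made-up
# partial values:
import pandas as pd

_partials = pd.DataFrame({'_0': ['10.5', '20.0', '5.25']})   # one row per upstream partition
_reduced = pd.DataFrame({'_0': [_partials['_0'].astype(float).sum()]})
# _reduced['_0'][0] == 35.75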
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings:
    :return: None
    """

    query_plan = QueryPlan(is_async=settings.parallel, buffer_size=settings.buffer_size)

    def scan_A_fun(df):
        df.columns = settings.table_A_field_names
        criterion = settings.table_A_filter_fn(df)
        df = df[criterion]
        return df

    # Define the operators
    scan_A = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_A_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_A_fun)),
            range(0, settings.table_A_parts))

    def scan_B_fun(df):
        df.columns = settings.table_B_field_names
        criterion = settings.table_B_filter_fn(df)
        return df[criterion]

    scan_B = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_B_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_B_fun)),
            range(0, settings.table_B_parts))

    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
        'project_B_{}'.format(p), query_plan, True, project_fn_B)),
        range(0, settings.table_B_parts))

    filter_b = map(lambda p: query_plan.add_operator(Filter(
        PredicateExpression(None, pd_expr=settings.table_B_filter_fn),
        'filter_b' + '_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))
    """

    def scan_C_fun(df):
        df.columns = settings.table_C_field_names
        criterion = settings.table_C_filter_fn(df)
        return df[criterion]

    scan_C = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_C_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_C_fun)),
            range(0, settings.table_C_parts))

    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
            settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
        'project_C_{}'.format(p), query_plan, True, project_fn_C)),
        range(0, settings.table_C_parts))

    filter_c = map(lambda p: query_plan.add_operator(Filter(
        PredicateExpression(None, pd_expr=settings.table_C_filter_fn),
        'filter_c' + '_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                          'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                          'join_probe_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def agg_fun(df):
        return pd.DataFrame({'sum': [df[settings.table_C_detail_field_name].astype(float).sum()]})
        # return pd.DataFrame( { 'sum' : [ len(df) ] } )

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate(
                [
                    AggregateExpression(AggregateExpression.SUM,
                                        lambda t: float(t[settings.table_C_detail_field_name]))
                ],
                settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, agg_fun)),
        range(0, settings.table_C_parts))

    def agg_reduce_fun(df):
        return pd.DataFrame({'sum': [df['sum'].sum()]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            settings.use_pandas, 'aggregate_reduce', query_plan, False, agg_reduce_fun))

    """
    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan, False))
    """

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    # connect_many_to_many(project_A, filter_A)
    # connect_many_to_many(filter_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, map_B_to_B)
    # connect_many_to_many(project_B, filter_b)
    # connect_many_to_many(filter_b, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    # connect_many_to_many(join_probe_A_B, part_aggregate)
    # connect_many_to_one(part_aggregate, aggregate_reduce)
    # connect_one_to_one(aggregate_reduce, collate)

    connect_many_to_many(join_build_AB_C, join_probe_AB_C)
    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    # connect_many_to_many(project_C, filter_c)
    # connect_many_to_many(filter_c, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, collate)
    # connect_one_to_one(aggregate_project, collate)

    return query_plan
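# --- Illustrative usage sketch (not part of the original module) ---
# Assuming a populated SyntheticBaselineJoinSettings instance named `settings`, the returned
# plan can be driven with the same calls used by the run() functions elsewhere in this code:
#
#     plan = query_plan(settings)
#     plan.execute()
#     plan.print_metrics()
#     plan.stop()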
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts,
        lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_op('part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_lineitem_join_build_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create', query_plan))

    lineitem_bloom_use = \
        map(lambda p: query_plan.add_operator(
            tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
                lineitem_sharded, p, part_parts, use_pandas, secure, use_native,
                'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)),
            range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, False)),
        range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p: query_plan.add_operator(
            HashJoinBuild('l_partkey',
                          'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                          'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)),
            range(0, other_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_lineitem_join_probe_map)
    map(lambda o: o.set_async(False), part_lineitem_join_build_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))

    # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project))

    connect_many_to_one(part_project, part_bloom_create)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # part_project.connect(part_bloom_create)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
    #     enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
    #     enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    connect_many_to_many(lineitem_project, part_lineitem_join_probe_map)
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
        assert round(float(tuples[1][0]), 10) == 372414.2899999995
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
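# --- Illustrative usage sketch (not part of the original module) ---
# A possible invocation of the bloom-join variant; every value below is a placeholder, and
# `expected_q17_result` stands in for the verified reference value the caller must supply.
#
#     run(parallel=True, use_pandas=True, secure=False, use_native=False, buffer_size=8192,
#         lineitem_parts=32, part_parts=4, lineitem_sharded=True, part_sharded=False,
#         other_parts=2, sf=1, fp_rate=0.01, expected_result=expected_q17_result,
#         format_=Format.CSV)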
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts,
        lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_):
    """The baseline test uses joins with no projection and no filtering pushed down to S3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. Scanning lineitem
    3. Joining the two tables (essentially filtering out lineitems that don't include the parts kept in step 1)
    4. Computing the average of l_quantity from the joined table, grouped by partkey
    5. Joining these computed averages with the joined table from step 3
    6. Filtering out any rows where l_quantity is less than the computed average

    TODO: There are a few ways this can be done; the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p: query_plan.add_operator(
        tpch_q17.sql_scan_part_select_all_op(
            part_sharded, p, part_parts, use_pandas, secure, use_native,
            'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    lineitem_scan = map(lambda p: query_plan.add_operator(
        tpch_q17.sql_scan_lineitem_select_all_op(
            lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native,
            'lineitem_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, lineitem_parts))

    part_project = map(lambda p: query_plan.add_operator(
        tpch_q17.project_partkey_brand_container_op(
            'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_filter = map(lambda p: query_plan.add_operator(
        tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_map = map(lambda p: query_plan.add_operator(
        Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    lineitem_project = map(lambda p: query_plan.add_operator(
        tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op(
            'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    lineitem_map = map(lambda p: query_plan.add_operator(
        Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
        range(0, lineitem_parts))

    part_lineitem_join_build = map(lambda p: query_plan.add_operator(
        HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_lineitem_join_probe = map(lambda p: query_plan.add_operator(
        HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                      'part_lineitem_join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    lineitem_part_avg_group = map(lambda p: query_plan.add_operator(
        tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator(
        tpch_q17.project_partkey_avg_quantity_op(
            'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p: query_plan.add_operator(
            HashJoinBuild('l_partkey',
                          'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                          'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)),
            range(0, other_parts))

    lineitem_filter = map(lambda p: query_plan.add_operator(
        tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator(
        tpch_q17.aggregate_sum_extendedprice_op(
            use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))
    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
        assert round(float(tuples[1][0]), 10) == 372414.2899999995
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
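# --- Illustrative sketch (not part of the original benchmark) ---
# The `p % part_parts` indexing above wraps the producer index around part_parts, i.e. the
# join-probe outputs are assigned to the downstream probe operators round-robin. A tiny
# pure-Python illustration with made-up counts:
_num_producers = 8
_part_parts = 4
_routing = [p % _part_parts for p in range(_num_producers)]
# _routing == [0, 1, 2, 3, 0, 1, 2, 3]: producers wrap around the 4 downstream operators.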
def query_plan(settings):
    # type: (SyntheticFilteredJoinSettings) -> QueryPlan
    """
    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                         "select "
                         " {} "
                         "from "
                         " S3Object "
                         " {} "
                         " {} "
                         .format(','.join(settings.table_A_field_names),
                                 ' where {} '.format(settings.table_A_filter_sql)
                                 if settings.table_A_filter_sql is not None else '',
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded,
                                                add_where=settings.table_A_filter_sql is None)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_A_{}'.format(p),
                         query_plan,
                         False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)],
            settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                    'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                         "select "
                         " {} "
                         "from "
                         " S3Object "
                         " {} "
                         " {} "
                         .format(','.join(settings.table_B_field_names),
                                 ' where {} '.format(settings.table_B_filter_sql)
                                 if settings.table_B_filter_sql is not None else '',
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded,
                                                add_where=settings.table_B_filter_sql is None)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_B_{}'.format(p),
                         query_plan,
                         False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                    'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p: query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                             "select "
                             " {} "
                             "from "
                             " S3Object "
                             "where "
                             " {} "
                             " {} "
                             .format(','.join(settings.table_C_field_names),
                                     settings.table_C_filter_sql,
                                     get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                    settings.table_C_sharded, add_where=False)),
                             settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_C_{}'.format(p),
                             query_plan,
                             False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
                settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p),
                              query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                              'join_probe_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                          'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    if settings.table_C_key is None:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate(
                    [
                        AggregateExpression(AggregateExpression.SUM,
                                            lambda t: float(t[settings.table_B_detail_field_name]))
                    ],
                    settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False,
                    part_aggregate_fn)),
            range(0, settings.other_parts))
    else:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate(
                    [
                        AggregateExpression(AggregateExpression.SUM,
                                            lambda t: float(t[settings.table_C_detail_field_name]))
                    ],
                    settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False,
                    part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    map(lambda o: o.set_async(False), project_B)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)

        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)

        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
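# --- Illustrative sketch (assumptions from the helper names, not the library implementation) ---
# The connect_* helpers used above appear to wire lists of operators in a few fixed patterns.
# A rough pure-Python analog over a hypothetical `connect(producer, consumer)` primitive:
def _connect_many_to_many_sketch(producers, consumers, connect):
    # Pairwise by position: producers[i] feeds consumers[i].
    for p, c in zip(producers, consumers):
        connect(p, c)

def _connect_all_to_all_sketch(producers, consumers, connect):
    # Full bipartite wiring: every producer feeds every consumer (used around Map repartitioning).
    for p in producers:
        for c in consumers:
            connect(p, c)

def _connect_many_to_one_sketch(producers, consumer, connect):
    # Fan-in: every producer feeds the single consumer (e.g. partial aggregates into the reducer).
    for p in producers:
        connect(p, consumer)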
def run(parallel, use_pandas, buffer_size, selectivity1not0, typed, path, format_):
    secure = False
    use_native = False
    print('')
    print("CSV/Parquet Filter Return 1 Column Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    if selectivity1not0:
        if typed:
            # SQL scan the file
            scan = map(lambda p: query_plan.add_operator(
                SQLTableScan(path,
                             "select F0 from S3Object "
                             "where F0 >= 0;",
                             format_, use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False)),
                range(0, 1))
        else:
            # SQL scan the file
            scan = map(lambda p: query_plan.add_operator(
                SQLTableScan(path,
                             "select F0 from S3Object "
                             "where cast(F0 as int) >= 0;",
                             format_, use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False)),
                range(0, 1))
    else:  # selectivity = 0
        if typed:
            scan = map(lambda p: query_plan.add_operator(
                SQLTableScan(path,
                             "select F0 from S3Object "
                             "where F0 < 0;",
                             format_, use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False)),
                range(0, 1))
        else:
            scan = map(lambda p: query_plan.add_operator(
                SQLTableScan(path,
                             "select F0 from S3Object "
                             "where cast(F0 as int) < 0;",
                             format_, use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False)),
                range(0, 1))

    # Project
    def fn(df):
        df.columns = ["F0"]
        return df

    project = map(lambda p: query_plan.add_operator(
        Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, 1))

    # Aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(1))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
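# --- Illustrative usage sketch (not part of the original benchmark) ---
# A possible invocation of the selectivity benchmark; the S3 key and flag values below are
# placeholders for illustration only (selectivity1not0=True selects every row, typed=False
# forces the cast(F0 as int) form of the predicate).
if __name__ == '__main__':
    run(parallel=True, use_pandas=True, buffer_size=8192, selectivity1not0=True,
        typed=False, path='benchmark/filter_1col.csv', format_=Format.CSV)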
def run(parallel, buffer_size, parts): """ :return: None """ print('') print("TPCH Q14 Partitioned Bloom Join") print("-------------------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan)) sum_aggregate = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'sum_aggregate', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) hash_join_build_ops = [] hash_join_probe_ops = [] part_bloom_create_ops = [] line_item_scan_ops = [] for p in range(0, parts): part_scan = query_plan.add_operator( tpch_q14.sql_scan_part_partkey_type_part_where_brand12_sharded_operator_def(p, parts, 'part_scan' + '_' + str(p), query_plan)) part_scan_project = query_plan.add_operator( tpch_q14.project_partkey_type_operator_def('part_scan_project' + '_' + str(p), query_plan)) part_bloom_create = query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create' + '_' + str(p), query_plan)) part_bloom_create_ops.append(part_bloom_create) lineitem_scan = query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_sharded_operator_def(min_shipped_date, max_shipped_date, p, parts, 'lineitem_scan' + '_' + str(p), query_plan)) line_item_scan_ops.append(lineitem_scan) lineitem_scan_project = query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def('lineitem_scan_project' + '_' + str(p), query_plan)) join_build = query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)) hash_join_build_ops.append(join_build) join_probe = query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)) hash_join_probe_ops.append(join_probe) aggregate = query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def('aggregate' + '_' + str(p), query_plan)) part_scan.connect(part_scan_project) lineitem_scan.connect(lineitem_scan_project) part_scan_project.connect(part_bloom_create) # part_bloom_create.connect(lineitem_scan) part_scan_project.connect(join_build) join_probe.connect(aggregate) # aggregate.connect(aggregate_project) aggregate.connect(sum_aggregate) join_probe.connect_tuple_producer(lineitem_scan_project) # join_probe.connect(merge) for bloom_create_op in part_bloom_create_ops: for scan_op in line_item_scan_ops: bloom_create_op.connect(scan_op) for probe_op in hash_join_probe_ops: for build_op in hash_join_build_ops: probe_op.connect_build_producer(build_op) sum_aggregate.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print("parts: {}".format(parts)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(parts)) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) 
== 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL # assert tuples[1] == [15.090116526324298] assert round(float(tuples[1][0]), 10) == 15.0901165263
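# Illustrative sketch: a standalone pandas version of the arithmetic the plan above
# assembles with two SUM aggregate expressions ('_0' and '_1') followed by a projection.
# The 100 * promo / total formula and the column names below follow the standard TPC-H Q14
# definition; they are an assumption about what tpch_q14.project_promo_revenue_operator_def
# produces, not taken from this file.
def _sketch_q14_promo_revenue():
    import pandas as pd

    joined = pd.DataFrame({
        'p_type': ['PROMO BRUSHED TIN', 'STANDARD POLISHED COPPER'],
        'l_extendedprice': [1000.0, 2000.0],
        'l_discount': [0.05, 0.10],
    })

    revenue = joined['l_extendedprice'] * (1 - joined['l_discount'])
    promo = revenue.where(joined['p_type'].str.startswith('PROMO'), 0.0)

    # '_0' and '_1' mirror the two SUM aggregates wired into sum_aggregate above.
    sums = pd.DataFrame({'_0': [promo.sum()], '_1': [revenue.sum()]})
    promo_revenue = 100.0 * sums['_0'][0] / sums['_1'][0]
    return promo_revenue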
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q17 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_select_partkey_where_brand_container_op( part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create_map = map(lambda p: query_plan.add_operator( Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_lineitem_join_build_map = map(lambda p: query_plan.add_operator( Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_bloom_create = map(lambda p: query_plan.add_operator( tpch_q17.bloom_create_partkey_op('part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_bloom_use = map(lambda p: query_plan.add_operator( tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op( part_parts != 1, p, part_parts, use_pandas, 'lineitem_bloom_use' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q17.project_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_probe_map = map(lambda p: query_plan.add_operator( Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, True)), range(0, lineitem_parts)) # part_lineitem_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join' + '_' + str(p), query_plan)), # range(0, part_parts)) part_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str(p), query_plan, True)), range(0, part_parts)) lineitem_part_avg_group = map(lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_op('lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, part_parts)) # part_lineitem_join_avg_group_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_l_partkey_p_partkey_op( # 'part_lineitem_join_avg_group_join' + '_' + str(p), query_plan)), # range(0, part_parts)) part_lineitem_join_avg_group_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_lineitem_join_avg_group_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_filter = map(lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)), range(0, part_parts)) 
extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan)) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project)) # part_project.connect(part_bloom_create) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)), enumerate(part_bloom_create_map)) # part_bloom_create.connect(lineitem_bloom_use) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)), enumerate(part_bloom_create)) # lineitem_bloom_use.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use)) # part_lineitem_join.connect_left_producer(part_project) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_lineitem_join_build_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) # part_lineitem_join.connect_right_producer(lineitem_project) map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project)) map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(part_lineitem_join_probe_map)) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') 
print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 4632.1085714286
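# Illustrative sketch: a standalone pandas walk-through of the computation the Q17 operators
# above are wired to perform (per-part average quantity, the quantity filter, and the final
# sum divided by 7). The 0.2 factor, the /7.0, and the column names follow the standard
# TPC-H Q17 definition and are assumptions about what the tpch_q17 helper operators compute.
def _sketch_q17_avg_yearly():
    import pandas as pd

    lineitem = pd.DataFrame({
        'l_partkey': [1, 1, 1, 2, 2],
        'l_quantity': [1.0, 30.0, 40.0, 5.0, 50.0],
        'l_extendedprice': [100.0, 300.0, 400.0, 50.0, 500.0],
    })

    # lineitem_part_avg_group / *_project: average quantity per part key.
    avg_qty = lineitem.groupby('l_partkey')['l_quantity'].mean().reset_index()
    avg_qty = avg_qty.rename(columns={'l_quantity': 'avg_l_quantity'})

    # part_lineitem_join_avg_group_join_*: join the per-part averages back onto the rows.
    joined = lineitem.merge(avg_qty, on='l_partkey')

    # lineitem_filter: keep rows whose quantity is below 20% of the per-part average.
    filtered = joined[joined['l_quantity'] < 0.2 * joined['avg_l_quantity']]

    # extendedprice_sum_aggregate + project_avg_yearly: sum and divide by 7.
    return filtered['l_extendedprice'].sum() / 7.0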
def query_plan(settings): # type: (SyntheticBloomJoinSettings) -> QueryPlan """ :return: None """ query_plan = QueryPlan(is_async=settings.parallel, buffer_size=settings.buffer_size) def scan_A_fn(df): df.columns = settings.table_A_field_names return df # Define the operators scan_A = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_A_field_names), settings.table_A_filter_sql, get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_A_{}'.format(p), query_plan, False, fn=scan_A_fn)), range(0, settings.table_A_parts)) """ field_names_map_A = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names)) def project_fn_A(df): df.rename(columns=field_names_map_A, inplace=True) return df project_A = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()], 'project_A_{}'.format(p), query_plan, True, project_fn_A)), range(0, settings.table_A_parts)) """ bloom_create_a = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'. format(p), query_plan, False)), range(0, settings.table_A_parts)) def scan_B_fn(df): df.columns = settings.table_B_field_names return df scan_B = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_B_field_names), settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.table_B_AB_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_B_{}'.format(p), query_plan, False, fn=scan_B_fn)), range(0, settings.table_B_parts)) """ field_names_map_B = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names)) def project_fn_B(df): df.rename(columns=field_names_map_B, inplace=True) return df project_B = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()], 'project_B_{}'.format(p), query_plan, True, project_fn_B)), range(0, settings.table_B_parts)) """ def scan_C_fn(df): df.columns = settings.table_C_field_names return df scan_C = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_C_field_names), settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_BC_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_C_{}'.format(p), query_plan, False, fn=scan_C_fn)), range(0, settings.table_C_parts)) """ field_names_map_C = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names)) def project_fn_C(df): df.rename(columns=field_names_map_C, inplace=True) return df project_C = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in 
field_names_map_C.iteritems()], 'project_C_{}'.format(p), query_plan, True, project_fn_C)), range(0, settings.table_C_parts)) """ map_A_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) map_B_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_B_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_C_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) join_build_A_B = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_A_B = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. table_B_AB_join_key), 'join_probe_A_B_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) bloom_create_ab = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_AB_C = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'. format(p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_AB_C = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_AB_C_{}' .format(p), query_plan, False)), range(0, settings.table_C_parts)) def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype(float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Connect the operators connect_many_to_many(scan_A, map_A_to_B) #connect_many_to_many(scan_A, project_A) #connect_many_to_many(project_A, map_A_to_B) connect_all_to_all(map_A_to_B, join_build_A_B) connect_many_to_many(join_build_A_B, join_probe_A_B) #connect_many_to_many(project_A, bloom_create_a) connect_many_to_many(scan_A, bloom_create_a) connect_all_to_all(bloom_create_a, scan_B) connect_many_to_many(scan_B, map_B_to_B) #connect_many_to_many(scan_B, project_B) #connect_many_to_many(project_B, map_B_to_B) connect_all_to_all(map_B_to_B, join_probe_A_B) connect_many_to_many(join_probe_A_B, bloom_create_ab) connect_all_to_all(bloom_create_ab, scan_C) connect_many_to_many(join_build_AB_C, join_probe_AB_C) 
connect_many_to_many(join_probe_A_B, map_B_to_C) connect_all_to_all(map_B_to_C, join_build_AB_C) connect_many_to_many(scan_C, map_C_to_C) #connect_many_to_many(scan_C, project_C) #connect_many_to_many(project_C, map_C_to_C) connect_all_to_all(map_C_to_C, join_probe_AB_C) connect_many_to_many(join_probe_AB_C, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
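# Illustrative sketch: the connect_* helpers used above are not defined in this file.
# Judging by the explicit nested loops written out in the Q14 partitioned bloom join earlier
# (bloom_create_op.connect(scan_op) for every producer/consumer pair), they are presumably
# thin wrappers over Operator.connect(); the functions below are a minimal sketch of that
# assumption, not the project's actual implementations.
def _sketch_connect_one_to_one(producer, consumer):
    producer.connect(consumer)

def _sketch_connect_many_to_one(producers, consumer):
    for producer in producers:
        producer.connect(consumer)

def _sketch_connect_many_to_many(producers, consumers):
    # Pairwise: shard p of the producers feeds shard p of the consumers.
    for producer, consumer in zip(producers, consumers):
        producer.connect(consumer)

def _sketch_connect_all_to_all(producers, consumers):
    # Cross product: every producer shard feeds every consumer shard.
    for producer in producers:
        for consumer in consumers:
            producer.connect(consumer)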
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) def fn(df): # df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'] # df[ ['l_extendedprice'] ] = df[ ['l_extendedprice'] ].astype(np.float) # criterion = (df['l_extendedprice'] >= lower) & ( # df['l_extendedprice'] <= upper) df[['_5']] = df[['_5']].astype(np.float) criterion = (df['_5'] >= lower) & (df['_5'] <= upper) return df[criterion] # scan the file scan_filter = map( lambda p: query_plan.add_operator( TableScan(get_file_key('lineitem', True, p, sf=sf), use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False, fn=fn)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(aggregate), enumerate(scan_filter)) aggregate.connect(collate) # scan_filter[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof")) # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof")) # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof")) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
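# Illustrative sketch: a tiny demonstration of what the fn() passed to TableScan above does.
# Scanned rows arrive with positional column names ('_5' is l_extendedprice in the lineitem
# layout), the column is cast to float (np.float in the code above), and rows outside
# [lower, upper] are dropped client-side after the full object has been fetched.
def _sketch_local_extendedprice_filter(lower=100.0, upper=200.0):
    import pandas as pd

    df = pd.DataFrame({'_0': ['1', '2', '3'], '_5': ['50.0', '150.0', '250.0']})
    df[['_5']] = df[['_5']].astype(float)
    criterion = (df['_5'] >= lower) & (df['_5'] <= upper)
    return df[criterion]  # only the 150.0 row survives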
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded): """ :return: None """ print('') print("TPCH Q14 Semi Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) part_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_partkey_where_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_1' + '_' + str(p), query_plan)), range(0, part_parts)) part_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_p_partkey_operator_def( 'part_1_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def( 'part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_l_partkey_operator_def( 'lineitem_1_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_1_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_1_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_1_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) join_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_l_partkey_operator_def( 'join_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) part_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_2' + '_' + str(p), query_plan)), range(0, part_parts)) part_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_type_operator_def( 'part_2_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_2_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_2_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_2_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_2_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( 
tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(part_1_project[p]), enumerate(part_scan_1)) map(lambda (p, o): o.connect(part_bloom_create[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(part_bloom_create)), enumerate(lineitem_scan_1)) map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_1_build)), enumerate(part_lineitem_join_1_probe)) map(lambda (p, o): o.connect(lineitem_1_project[p]), enumerate(lineitem_scan_1)) map( lambda (p, o): part_lineitem_join_1_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_1_project)) map(lambda (p, o): o.connect(join_bloom_create[p]), enumerate(part_lineitem_join_1_probe)) map( lambda (p, o): map(lambda (bp, bo): o.connect(bo), enumerate(part_scan_2)), enumerate(join_bloom_create)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(join_bloom_create)), enumerate(lineitem_scan_2)) map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2)) map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]), enumerate(part_2_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_2_build)), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(lineitem_2_project[p]), enumerate(lineitem_scan_2)) map( lambda (p, o): part_lineitem_join_2_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_2_project)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
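# Illustrative sketch: the "[p % part_parts]" indexing above spreads lineitem_parts producer
# shards over part_parts probe operators round-robin when the two shard counts differ. A tiny
# standalone illustration of that routing, with operator lists replaced by plain indices:
def _sketch_modulo_routing(producer_count=6, consumer_count=4):
    routing = {}
    for p in range(producer_count):
        consumer = p % consumer_count
        routing.setdefault(consumer, []).append(p)
    # e.g. {0: [0, 4], 1: [1, 5], 2: [2], 3: [3]}
    return routing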
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts, sharded): """ :return: None """ print('') print("TPCH Q19 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Define the operators lineitem_scan = map(lambda p: query_plan.add_operator( tpch_q19.sql_scan_lineitem_select_all_where_partkey_op(sharded, p, lineitem_parts, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map(lambda p: query_plan.add_operator( tpch_q19.sql_scan_part_select_all_where_partkey_op(sharded, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_brand_size_container_op('part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map(lambda p: query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) # lineitem_part_join = map(lambda p: # query_plan.add_operator(tpch_q19.join_op(query_plan)), # range(0, part_parts)) lineitem_part_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'lineitem_partjoin_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) lineitem_part_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'lineitem_part_join_probe' + '_' + str(p), query_plan, True)), range(0, part_parts)) filter_op = map(lambda p: query_plan.add_operator(tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate = map(lambda p: query_plan.add_operator(tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q19.aggregate_project_def('aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan)) # Connect the operators # lineitem_scan.connect(lineitem_project) connect_many_to_many(lineitem_scan, lineitem_project) connect_all_to_all(lineitem_project, lineitem_map) # part_scan.connect(part_project) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) # lineitem_part_join.connect_left_producer(lineitem_project) connect_all_to_all(part_map, lineitem_part_join_build) # lineitem_part_join.connect_right_producer(part_project) connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe) connect_many_to_many(lineitem_map, lineitem_part_join_probe) # lineitem_part_join.connect(filter_op) connect_many_to_many(lineitem_part_join_probe, filter_op) # filter_op.connect(aggregate) connect_many_to_many(filter_op, aggregate) # aggregate.connect(aggregate_project) connect_many_to_one(aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) # aggregate_project.connect(collate) connect_one_to_one(aggregate_project, collate) # Write the plan graph 
query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL numpy.testing.assert_almost_equal(float(tuples[1][0]), 92403.0667)
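# Illustrative sketch: the per-shard aggregate operators above each emit one partial SUM in
# column '_0'; aggregate_reduce then sums those partials, and the project operator presumably
# renames the result to 'revenue' (the field name asserted above). A minimal pandas sketch of
# that two-level pattern, with made-up partial results standing in for the shard outputs:
def _sketch_partial_then_reduce():
    import pandas as pd

    partials = [pd.DataFrame({'_0': [10.0]}),
                pd.DataFrame({'_0': [20.5]}),
                pd.DataFrame({'_0': [30.0]})]
    merged = pd.concat(partials, ignore_index=True)
    reduced = pd.DataFrame({'_0': [merged['_0'].astype(float).sum()]})
    return reduced.rename(columns={'_0': 'revenue'})  # single row: 60.5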
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file scan = map( lambda p: query_plan.add_operator( SQLTableScan( get_file_key('lineitem', True, p, sf=sf, format_=format_ ), "select * from S3Object " "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};" .format(lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) ''' scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_), "select * from S3Object " "where l_extendedprice >= {} and l_extendedprice <= {};".format( lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) ''' # project def fn(df): df.columns = [ 'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment' ] df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
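# Illustrative sketch: unlike the previous benchmark, the filter here is pushed into the
# S3 Select statement, so only matching rows leave S3. The helper below (a hypothetical name,
# not part of the benchmark) just shows the predicate string the .format() call above builds
# for a given [lower, upper] range:
def _sketch_pushdown_sql(lower, upper):
    return ("select * from S3Object "
            "where cast(l_extendedprice as float) >= {} "
            "and cast(l_extendedprice as float) <= {};".format(lower, upper))

# _sketch_pushdown_sql(100, 200) ->
# "select * from S3Object where cast(l_extendedprice as float) >= 100 and cast(l_extendedprice as float) <= 200;"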
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas, buffer_size, table_parts, lower, upper, sharded, sf, use_shared_mem, inline_ops, merge_ops): secure = False use_native = False if use_shared_mem: system = WorkerSystem(shared_mem_buffer_size) else: system = None query_plan = QueryPlan(system, is_async=parallel, buffer_size=buffer_size, use_shared_mem=use_shared_mem) # scan the file scan = map( lambda p: query_plan.add_operator( TableScan(get_file_key('lineitem', sharded, p, sf=sf), use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(0, table_parts)) def filter_fn(df): return (df['_5'].astype(np.float) >= lower) & (df['_5'].astype( np.float) <= upper) filter_ = map( lambda p: query_plan.add_operator( Filter(PredicateExpression(None, filter_fn), 'filter_{}'.format(p), query_plan, False)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda p: p.set_async(not inline_ops), filter_) connect_many_to_many(scan, filter_) connect_many_to_one(filter_, aggregate) connect_one_to_one(aggregate, collate) profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops, use_shared_mem) scan[0].set_profiled( True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + profile_file_suffix + ".prof")) filter_[0].set_profiled( True, os.path.join( ROOT_DIR, "../benchmark-output/", gen_test_id() + "_filter_0" + profile_file_suffix + ".prof")) aggregate.set_profiled( True, os.path.join( ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + profile_file_suffix + ".prof")) collate.set_profiled( True, os.path.join( ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + profile_file_suffix + ".prof")) return query_plan
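# Illustrative sketch: set_profiled() above points each operator at a .prof output file.
# Assuming these are standard cProfile dumps (an assumption about the framework, not stated
# in this file), such files can be produced and inspected with the stock cProfile/pstats
# modules, for example:
def _sketch_profile_and_report(prof_path='/tmp/sketch_filter_0.prof'):
    import cProfile
    import pstats

    def work():
        return sum(i * i for i in range(10000))

    cProfile.runctx('work()', globals(), locals(), filename=prof_path)
    stats = pstats.Stats(prof_path)
    stats.sort_stats('cumulative').print_stats(5)  # top 5 entries by cumulative time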
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, nthreads=16, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) assert sf == 1 or sf == 10 # Scan Index Files index_scan = map( lambda p: query_plan.add_operator( SQLTableScan( 'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}' .format(sf, p if sf == 1 else p + 1), "select first_byte, last_byte " " from S3Object " " where col_values >= {} and col_values <= {};".format( lower, upper), format_, use_pandas, secure, use_native, 'index_scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Range accesses range_access = map( lambda p: query_plan.add_operator( TableRangeAccess( get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV), use_pandas, secure, use_native, 'range_access_{}'.format( p), query_plan, False)), range(0, table_parts)) map(lambda o: o.set_nthreads(nthreads), range_access) # Aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan)) map(lambda (p, o): o.connect(aggregate), enumerate(range_access)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
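# Illustrative sketch: the index files scanned above map l_extendedprice values to byte
# ranges within the corresponding lineitem shard, and TableRangeAccess presumably fetches
# only those ranges. A toy pandas version of the index lookup step; the index contents below
# are made up for illustration:
def _sketch_index_lookup(lower=100.0, upper=200.0):
    import pandas as pd

    index = pd.DataFrame({
        'col_values': [50.0, 120.0, 180.0, 400.0],
        'first_byte': [0, 1024, 2048, 3072],
        'last_byte': [1023, 2047, 3071, 4095],
    })
    hits = index[(index['col_values'] >= lower) & (index['col_values'] <= upper)]
    # -> byte ranges [(1024, 2047), (2048, 3071)] to fetch from the shard
    return list(zip(hits['first_byte'], hits['last_byte']))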
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # This date is chosen because it triggers the filter to filter out 1 of the rows in the root data set. date = '1996-03-13' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_extra_filtered_operator_def( lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project)) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(join_build)), enumerate(join_probe)) map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o), enumerate(lineitem_filter)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: 
{}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 33.4262326420
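# Illustrative sketch: HashJoinBuild/HashJoinProbe with JoinExpression('p_partkey', 'l_partkey')
# split a hash join into a build side (part tuples keyed by p_partkey) and a probe side
# (lineitem tuples looked up by l_partkey). A plain-Python sketch of that split, using dicts
# of lists in place of the operators; this is an assumption about their internals, not the
# project's implementation.
def _sketch_hash_join(parts, lineitems):
    # Build phase: index the part rows by join key.
    hashtable = {}
    for part in parts:
        hashtable.setdefault(part['p_partkey'], []).append(part)

    # Probe phase: stream lineitem rows and emit matches.
    joined = []
    for li in lineitems:
        for part in hashtable.get(li['l_partkey'], []):
            merged = dict(part)
            merged.update(li)
            joined.append(merged)
    return joined

# _sketch_hash_join([{'p_partkey': 1, 'p_type': 'PROMO TIN'}],
#                   [{'l_partkey': 1, 'l_extendedprice': 100.0}])
# -> [{'p_partkey': 1, 'p_type': 'PROMO TIN', 'l_partkey': 1, 'l_extendedprice': 100.0}]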
def query_plan(settings): # type: (SyntheticSemiJoinSettings) -> QueryPlan """ :return: None """ if settings.use_shared_mem: system = WorkerSystem(settings.shared_memory_size) else: system = None query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size, use_shared_mem=settings.use_shared_mem) # Define the operators scan_a = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_A_AB_join_key, settings.table_A_filter_sql, get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_a' + '_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) field_names_map_a = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_A_field_names) ], settings.table_A_field_names)) def project_fn_a(df): df = df.rename(columns=field_names_map_a, copy=False) return df project_a = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_a.iteritems() ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a) ), range(0, settings.table_A_parts)) bloom_create_ab_join_key = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_A_AB_join_key, 'bloom_create_ab_join_key' + '_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_A_parts)) scan_b_on_ab_join_key = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_B_BC_join_key, settings.table_B_AB_join_key, settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.format_, settings.table_B_AB_join_key, settings.use_pandas, settings.secure, settings.use_native, 'scan_b_on_ab_join_key' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) if settings.table_C_key is None: scan_b_detail_on_b_pk = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_B_primary_key, settings.table_B_detail_field_name, settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.format_, settings.table_B_primary_key, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_detail_on_b_pk' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_b_detail = OrderedDict([ ('_0', settings.table_B_primary_key), ('_1', settings.table_B_detail_field_name) ]) def project_fn_b_detail(df): df.rename(columns=field_names_map_b_detail, inplace=True) return df project_b_detail = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_b_detail.iteritems() ], 'project_b_detail' + '_{}'.format(p), query_plan, False, project_fn_b_detail)), range(0, settings.table_B_parts)) map_b_pk_1 = map( lambda p: query_plan.add_operator( Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) map_b_pk_2 
= map( lambda p: query_plan.add_operator( Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) bloom_create_b_pk = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_primary_key, 'bloom_create_b_pk' + '_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_B_parts)) join_probe_ab_and_b_on_b_pk = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_primary_key, settings. table_B_primary_key), 'join_probe_ab_and_b_on_b_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) join_build_ab_and_b_on_b_pk = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_primary_key, 'join_build_ab_and_b_on_b_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) else: scan_c_on_bc_join_key = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {}, {} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_C_primary_key, settings.table_C_BC_join_key, settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_BC_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_on_bc_join_key' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_c = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_C_field_names) ], settings.table_C_field_names)) def project_fn_c(df): df.rename(columns=field_names_map_c, inplace=True) return df project_c = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_c.iteritems() ], 'project_c' + '_{}'.format(p), query_plan, False, project_fn_c)), range(0, settings.table_C_parts)) scan_c_detail_on_c_pk = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_C_primary_key, settings.table_C_detail_field_name, settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_primary_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_detail_on_c_pk' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_c_detail = OrderedDict([ ('_0', settings.table_C_primary_key), ('_1', settings.table_C_detail_field_name) ]) def project_fn_c_detail(df): df.rename(columns=field_names_map_c_detail, inplace=True) return df project_c_detail = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_c_detail.iteritems() ], 'project_c_detail' + '_{}'.format(p), query_plan, False, project_fn_c_detail)), range(0, settings.table_C_parts)) map_bc_b_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'. 
format(p), query_plan, False)), range(0, settings.table_C_parts)) map_c_pk_1 = map( lambda p: query_plan.add_operator( Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) map_c_pk_2 = map( lambda p: query_plan.add_operator( Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) bloom_create_c_pk = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_C_primary_key, 'bloom_create_bc_b_to_c_join_key_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_C_parts)) join_build_ab_and_c_on_bc_join_key = map( lambda p: query_plan.add_operator( HashJoinBuild( settings.table_B_BC_join_key, 'join_build_ab_and_c_on_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_ab_and_c_on_bc_join_key = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_build_abc_and_c_on_c_pk = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_C_primary_key, 'join_build_abc_and_c_on_c_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_abc_and_c_on_c_pk = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_C_primary_key, settings. table_C_primary_key), 'join_probe_abc_and_c_on_c_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) bloom_create_bc_join_key = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_BC_join_key, 'bloom_create_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) map_bc_c_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_b = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_B_field_names) ], settings.table_B_field_names)) def project_fn_b(df): df.rename(columns=field_names_map_b, inplace=True) return df project_b = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_b.iteritems() ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b) ), range(0, settings.table_B_parts)) map_ab_a_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}' .format(p), query_plan, False)), range(0, settings.table_A_parts)) map_ab_b_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_a_and_b_on_ab_join_key = map( lambda p: query_plan.add_operator( HashJoinBuild( settings.table_A_AB_join_key, 'join_build_a_and_b_on_ab_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_a_and_b_on_ab_join_key = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. 
table_B_AB_join_key), 'join_probe_a_and_b_on_ab_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) if settings.table_C_key is None: def part_aggregate_fn(df): sum_ = df[settings.table_B_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_B_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_B_parts)) else: def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Inline some of the operators map(lambda o: o.set_async(False), project_a) map(lambda o: o.set_async(False), project_b) map(lambda o: o.set_async(False), map_ab_a_join_key) map(lambda o: o.set_async(False), map_ab_b_join_key) if settings.table_C_key is None: map(lambda o: o.set_async(False), map_b_pk_1) map(lambda o: o.set_async(False), map_b_pk_2) map(lambda o: o.set_async(False), project_b_detail) else: map(lambda o: o.set_async(False), map_bc_b_join_key) map(lambda o: o.set_async(False), map_bc_c_join_key) map(lambda o: o.set_async(False), map_c_pk_1) map(lambda o: o.set_async(False), map_c_pk_2) map(lambda o: o.set_async(False), project_c) map(lambda o: o.set_async(False), project_c_detail) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(scan_a, project_a) connect_many_to_many(project_a, map_ab_a_join_key) connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key) connect_all_to_all(project_a, bloom_create_ab_join_key) # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key) connect_many_to_many(join_build_a_and_b_on_ab_join_key, join_probe_a_and_b_on_ab_join_key) # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key) connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key) connect_many_to_many(scan_b_on_ab_join_key, project_b) # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key) # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key) connect_many_to_many(project_b, map_ab_b_join_key) connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B) if settings.table_C_key is None: # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate) connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail) connect_many_to_many(project_b_detail, map_b_pk_2) connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk) 
connect_all_to_all(join_probe_a_and_b_on_ab_join_key, bloom_create_b_pk) connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk) connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate) connect_many_to_many(join_build_ab_and_b_on_b_pk, join_probe_ab_and_b_on_b_pk) connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1) connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk) else: connect_all_to_all(join_probe_a_and_b_on_ab_join_key, bloom_create_bc_join_key) connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key) connect_many_to_many(scan_c_on_bc_join_key, project_c) # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key) connect_all_to_all(map_bc_c_join_key, join_probe_ab_and_c_on_bc_join_key) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key) connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bc_b_join_key) connect_all_to_all(map_bc_b_join_key, join_build_ab_and_c_on_bc_join_key) connect_all_to_all(join_probe_ab_and_c_on_bc_join_key, bloom_create_c_pk) # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk) connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1) connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk) connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk) # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk) connect_many_to_many(join_build_abc_and_c_on_c_pk, join_probe_abc_and_c_on_c_pk) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C) # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk) connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail) # connect_many_to_many(project_c_detail, map_C_to_C) # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk) connect_many_to_many(project_c_detail, map_c_pk_2) connect_many_to_many(project_c, map_bc_c_join_key) connect_many_to_many(join_build_ab_and_c_on_bc_join_key, join_probe_ab_and_c_on_bc_join_key) connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk) connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
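# Illustrative sketch: the OrderedDict/rename pattern used for project_a, project_b, project_c
# and the detail projections above just swaps the positional '_0', '_1', ... column names
# produced by the scans for the real field names. A tiny standalone demo with made-up names:
def _sketch_field_name_remap():
    from collections import OrderedDict
    import pandas as pd

    field_names = ['a_pk', 'a_detail']
    field_names_map = OrderedDict(
        zip(['_{}'.format(i) for i, _ in enumerate(field_names)], field_names))

    df = pd.DataFrame({'_0': [1, 2], '_1': [10, 20]})
    df.rename(columns=field_names_map, inplace=True)
    return list(df.columns)  # ['a_pk', 'a_detail']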
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts,
        lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_operator_def(
                lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native,
                'lineitem_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date,
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
        range(0, lineitem_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_filter = map(lambda p:
    #                   query_plan.add_operator(
    #                       tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)),
    #                   range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        sum2_ = df['_1'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1']))
        ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)
    connect_all_to_all(part_map, join_build)
    connect_many_to_many(join_build, join_probe)
    connect_many_to_many(lineitem_filter, lineitem_map)
    connect_all_to_all(lineitem_map, join_probe)
    connect_many_to_many(join_probe, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1
    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
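# Hedged usage sketch (not in the original source): one way the baseline join
# benchmark above might be driven. Every value below is an illustrative
# assumption (part counts, sharding flags, buffer size, scale factor), and
# Format is assumed to be the same format enum the other benchmarks use.
if __name__ == "__main__":
    EXPECTED_PROMO_REVENUE_SF1 = 0.0  # hypothetical placeholder; use the PostgreSQL-verified value
    run(parallel=True, use_pandas=True, secure=False, use_native=False,
        buffer_size=0,
        lineitem_parts=32, part_parts=4,
        lineitem_sharded=True, part_sharded=False,
        other_parts=32,
        sf=1,
        expected_result=EXPECTED_PROMO_REVENUE_SF1,
        format_=Format.CSV)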
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns,
        queried_aliases, castable_aliases, select_str, aggregate_column, filter_str, path,
        format_=Format.PARQUET):
    secure = False
    use_native = False

    print('')
    print("Parquet Aggregate Benchmark")
    print("---------------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan("{}/lineitem.{}.parquet".format(path, p),
                         "select {} from S3Object {};".format(select_str, filter_str),
                         format_, use_pandas, secure, use_native,
                         'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_aliases
        df[castable_aliases] = df[castable_aliases].astype(np.double)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # aggregation
    def agg_fun(df):
        if aggregate_column in df:
            return pd.DataFrame({'sum': [sum(df[aggregate_column])]})
        else:
            return pd.DataFrame({'count': len(df)}, index=[0])

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
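# Hedged usage sketch (not in the original source): one plausible invocation of
# the Parquet aggregate benchmark above, summing l_extendedprice over a few
# parts. The S3 key prefix, part counts and column lists are illustrative
# assumptions; filter_str is left empty so the scan reads every row.
if __name__ == "__main__":
    run(parallel=True, use_pandas=True, buffer_size=0,
        table_first_part=0, table_parts=4,
        queried_columns=['l_extendedprice'],
        queried_aliases=['l_extendedprice'],
        castable_aliases=['l_extendedprice'],
        select_str='l_extendedprice',
        aggregate_column='l_extendedprice',
        filter_str='',            # no WHERE clause
        path='parquet/lineitem',  # hypothetical S3 prefix holding lineitem.<p>.parquet parts
        format_=Format.PARQUET)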
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns,
        select_columns, filterOutNothing, path, format_=Format.PARQUET):
    secure = False
    use_native = False

    print('')
    print("Parquet Filter+Project All or None Benchmark Vary Columns")
    print("----------------------------------------------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    if filterOutNothing:
        # filter out nothing
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}/lineitem.{}.parquet".format(path, p),
                             "select {} from S3Object;".format(select_columns),
                             format_, use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False)),
            range(table_first_part, table_first_part + table_parts))
    else:
        # filter out everything (l_orderkey starts at 1, so this predicate matches no rows)
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}/lineitem.{}.parquet".format(path, p),
                             "select {} from S3Object where l_orderkey < 1;".format(select_columns),
                             format_, use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False)),
            range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_columns
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()