def lineitem_filter_def(name, query_plan):
    """Build the lineitem-side filter operator for TPC-H Q19.

    A row passes when l_quantity falls inside one of three 10-unit bands
    (starting at 3, 16 and 24) and the shipping conditions hold
    (l_shipmode is AIR or AIR REG, l_shipinstruct is DELIVER IN PERSON).

    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator (tuple predicate + vectorised pandas predicate)
    """

    def pd_expr(df):
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented equivalent. Convert once instead of per comparison.
        qty = df['l_quantity'].astype(int)
        # Shipping conditions are shared by all three quantity bands:
        # (A & S) | (B & S) | (C & S) == (A | B | C) & S.
        shipping = (df['l_shipmode'].isin(['AIR', 'AIR REG'])
                    & (df['l_shipinstruct'] == 'DELIVER IN PERSON'))
        in_band = (((qty >= 3) & (qty <= 3 + 10))
                   | ((qty >= 16) & (qty <= 16 + 10))
                   | ((qty >= 24) & (qty <= 24 + 10)))
        return in_band & shipping

    def tuple_expr(t_):
        # Tuple-at-a-time predicate, mirroring pd_expr.
        if t_['l_shipmode'] not in ['AIR', 'AIR REG']:
            return False
        if t_['l_shipinstruct'] != 'DELIVER IN PERSON':
            return False
        qty = int(t_['l_quantity'])
        return (3 <= qty <= 3 + 10) or (16 <= qty <= 16 + 10) or \
               (24 <= qty <= 24 + 10)

    return Filter(PredicateExpression(tuple_expr, pd_expr),
                  name, query_plan, False)
def part_filter_def(name, query_plan):
    """Build the part-side filter operator for TPC-H Q19.

    A row passes when brand, container set and p_size range match one of
    three bands (Brand#11/SM/1-5, Brand#44/MED/1-10, Brand#53/LG/1-15).

    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator (tuple predicate + vectorised pandas predicate)
    """

    def pd_expr(df):
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented equivalent. Convert once instead of per comparison.
        size = df['p_size'].astype(int)
        return (((df['p_brand'] == 'Brand#11')
                 & df['p_container'].isin(
                     ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])
                 & (size >= 1) & (size <= 5))
                | ((df['p_brand'] == 'Brand#44')
                   & df['p_container'].isin(
                       ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'])
                   & (size >= 1) & (size <= 10))
                | ((df['p_brand'] == 'Brand#53')
                   & df['p_container'].isin(
                       ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'])
                   & (size >= 1) & (size <= 15)))

    def tuple_expr(t_):
        # int() is applied lazily (after the brand check short-circuits),
        # matching the original lambda's evaluation order.
        return ((t_['p_brand'] == 'Brand#11'
                 and t_['p_container'] in
                 ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG']
                 and 1 <= int(t_['p_size']) <= 5)
                or (t_['p_brand'] == 'Brand#44'
                    and t_['p_container'] in
                    ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG']
                    and 1 <= int(t_['p_size']) <= 10)
                or (t_['p_brand'] == 'Brand#53'
                    and t_['p_container'] in
                    ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG']
                    and 1 <= int(t_['p_size']) <= 15))

    return Filter(PredicateExpression(tuple_expr, pd_expr),
                  name, query_plan, False)
def filter_def(name, query_plan):
    """Build the combined part/lineitem filter operator for TPC-H Q19.

    A row passes when one of three brand/container/quantity/size bands
    matches and the shipping conditions hold (l_shipmode in AIR/AIR REG,
    l_shipinstruct DELIVER IN PERSON).

    Fixes: the original source had the literal 'DELIVER IN PERSON' split
    across two lines (a SyntaxError); np.int was removed in NumPy 1.24.

    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator (tuple predicate + vectorised pandas predicate)
    """

    def pd_expr(df):
        quantity = df['l_quantity'].astype(int)
        size = df['p_size'].astype(int)
        # Shipping conditions are common to every band; factored out via
        # distributivity: (A & S) | (B & S) | (C & S) == (A | B | C) & S.
        shipping = (df['l_shipmode'].isin(['AIR', 'AIR REG'])
                    & (df['l_shipinstruct'] == 'DELIVER IN PERSON'))
        band_1 = ((df['p_brand'] == 'Brand#11')
                  & df['p_container'].isin(
                      ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])
                  & (quantity >= 3) & (quantity <= 3 + 10)
                  & (size >= 1) & (size <= 5))
        band_2 = ((df['p_brand'] == 'Brand#44')
                  & df['p_container'].isin(
                      ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'])
                  & (quantity >= 16) & (quantity <= 16 + 10)
                  & (size >= 1) & (size <= 10))
        band_3 = ((df['p_brand'] == 'Brand#53')
                  & df['p_container'].isin(
                      ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'])
                  & (quantity >= 24) & (quantity <= 24 + 10)
                  & (size >= 1) & (size <= 15))
        return (band_1 | band_2 | band_3) & shipping

    def tuple_expr(t_):
        # Tuple-at-a-time predicate, mirroring pd_expr.
        if t_['l_shipmode'] not in ['AIR', 'AIR REG']:
            return False
        if t_['l_shipinstruct'] != 'DELIVER IN PERSON':
            return False
        quantity = int(t_['l_quantity'])
        size = int(t_['p_size'])
        return ((t_['p_brand'] == 'Brand#11'
                 and t_['p_container'] in
                 ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG']
                 and 3 <= quantity <= 3 + 10 and 1 <= size <= 5)
                or (t_['p_brand'] == 'Brand#44'
                    and t_['p_container'] in
                    ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG']
                    and 16 <= quantity <= 16 + 10 and 1 <= size <= 10)
                or (t_['p_brand'] == 'Brand#53'
                    and t_['p_container'] in
                    ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG']
                    and 24 <= quantity <= 24 + 10 and 1 <= size <= 15))

    return Filter(PredicateExpression(tuple_expr, pd_expr),
                  name, query_plan, False)
def filter_brand_container_op(name, query_plan):
    """Filter rows where p_brand is 'Brand#41' and p_container is 'SM PACK'.

    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator
    """

    def pd_expr(df):
        brand_match = df['p_brand'] == 'Brand#41'
        container_match = df['p_container'] == 'SM PACK'
        return brand_match & container_match

    def tuple_expr(t_):
        return (t_['p_brand'] == 'Brand#41'
                and t_['p_container'] == 'SM PACK')

    return Filter(PredicateExpression(tuple_expr, pd_expr),
                  name, query_plan, False)
def filter_brand12_operator_def(name, query_plan):
    # type: (str, QueryPlan) -> Filter
    """Build a Filter that keeps only rows whose p_brand is 'Brand#12'.

    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator
    """

    def brand_matches(t_):
        return t_['p_brand'] == 'Brand#12'

    def pd_expr(df):
        return df['p_brand'] == 'Brand#12'

    return Filter(PredicateExpression(brand_matches, pd_expr),
                  name, query_plan, False)
def filter_shipdate_operator_def(max_shipped_date, name, query_plan):
    """Build a Filter keeping rows shipped on or before max_shipped_date.

    :param max_shipped_date: inclusive upper bound for l_shipdate
    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator
    """

    def pd_expr(df):
        # Vectorised form: parse l_shipdate (with the parse cache enabled)
        # and compare against the bound.
        shipped = pd.to_datetime(df['l_shipdate'], cache=True)
        return shipped <= max_shipped_date

    def tuple_expr(t_):
        return (cast(t_['l_shipdate'], timestamp) <=
                cast(max_shipped_date, timestamp))

    return Filter(PredicateExpression(tuple_expr, pd_expr),
                  name, query_plan, False)
def filter_shipdate_operator_def(min_shipped_date, max_shipped_date, name,
                                 query_plan):
    """Build a Filter keeping rows with
    min_shipped_date <= l_shipdate < max_shipped_date.

    :param min_shipped_date: inclusive lower bound for l_shipdate
    :param max_shipped_date: exclusive upper bound for l_shipdate
    :param name: operator name
    :param query_plan: query plan the operator belongs to
    :return: a Filter operator
    """

    def pd_expr(df):
        # Vectorised half-open interval check on the parsed ship date.
        return ((pd.to_datetime(df['l_shipdate']) >= min_shipped_date)
                & (pd.to_datetime(df['l_shipdate']) < max_shipped_date))

    def tuple_expr(t_):
        return ((cast(t_['l_shipdate'], timestamp) >=
                 cast(min_shipped_date, timestamp))
                and (cast(t_['l_shipdate'], timestamp) <
                     cast(max_shipped_date, timestamp)))

    return Filter(PredicateExpression(tuple_expr, pd_expr),
                  name, query_plan, False)
def filter_lineitem_quantity_op(name, query_plan):
    """Filter lineitem rows whose quantity is below the precomputed average.

    Equivalent SQL::

        with filter_join_2 as (
            select * from part_lineitem_join_avg_group_join
            where l_quantity < avg_l_quantity_computed00)

    :param query_plan: query plan the operator belongs to
    :param name: operator name
    :return: a Filter operator
    """

    def pd_expr(df):
        # np.float was removed in NumPy 1.24; the builtin float is the
        # documented equivalent.
        return (df['l_quantity'].astype(float)
                < df['avg_l_quantity_computed00'])

    return Filter(
        PredicateExpression(
            lambda t_: float(t_['l_quantity']) <
            t_['avg_l_quantity_computed00'],
            pd_expr),
        name, query_plan, False)
def test_filter_empty():
    """Executes a filter where no records are returned.

    We test this as it's somewhat peculiar with s3 select: s3 does not
    return column names when selecting data, so, unlike a traditional
    DBMS, no field-names tuple should be present in the results.

    :return: None
    """
    query_plan = QueryPlan()

    # Operator graph: scan -> filter -> collate.
    scan = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 0;',
                     False, 'ts', query_plan, False))
    date_filter = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >=
                                cast('1996-03-01', timestamp)),
            'f', query_plan, False))
    collate = query_plan.add_operator(Collate('c', query_plan, False))

    scan.connect(date_filter)
    date_filter.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # No rows (and no header tuple) should come back.
    assert len(collate.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def test_filter_baseline():
    """Runs a baseline filter over a three-row scan and verifies the
    collated tuples, including the field-names header row.

    :return: None
    """
    query_plan = QueryPlan(buffer_size=64, is_async=True,
                           use_shared_mem=False)

    # Operator graph: scan -> filter -> collate.
    # NOTE: using a 'use_native=True' argument will result in a None object
    # being returned.
    scan = query_plan.add_operator(
        SQLTableScan('random_strings_2.csv',
                     'select * from S3Object limit 3;', Format.CSV, True,
                     False, False, 'ts', query_plan, False))
    date_filter = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >=
                                cast('1996-03-01', timestamp)),
            'f', query_plan, False))
    collate = query_plan.add_operator(Collate('c', query_plan, False))

    scan.connect(date_filter)
    date_filter.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()
    print(len(tuples))

    # Header row plus two matching records.
    assert len(tuples) == 2 + 1
    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]
    assert tuples[0] == field_names
    assert tuples[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N',
        'O', '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON',
        'TRUCK', 'egular courts above the'
    ]
    assert tuples[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N',
        'O', '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN',
        'MAIL', 'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
def test_pandas_filter_baseline():
    """Runs the pandas (vectorised-predicate) variant of the baseline
    filter test and verifies the collated tuples.

    :return: None
    """
    query_plan = QueryPlan(buffer_size=8192)

    # Operator graph: scan -> pandas filter -> collate.
    scan = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;',
                     True, 'ts', query_plan, False))

    def pd_expr(df):
        return pd.to_datetime(df['_10']) >= '1996-03-01'

    date_filter = query_plan.add_operator(
        Filter(PredicateExpression(None, pd_expr), 'f', query_plan, True))
    collate = query_plan.add_operator(Collate('c', query_plan, False))

    scan.connect(date_filter)
    date_filter.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    # Header row plus two matching records.
    assert len(tuples) == 2 + 1
    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]
    assert tuples[0] == field_names
    assert tuples[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N',
        'O', '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON',
        'TRUCK', 'egular courts above the'
    ]
    assert tuples[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N',
        'O', '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN',
        'MAIL', 'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """Build the synthetic baseline join query plan.

    Scans tables A and B (and optionally C), projects positional columns
    to field names, applies optional pandas filters, hash-joins A with B
    (and the AB result with C when table_C_key is set), then sums a detail
    column and emits a single 'total_balance' row through a collate.

    NOTE(review): this function relies on Python 2 semantics -- map()
    returning a list and dict.iteritems() -- and on np.float, which was
    removed in NumPy 1.24. Under Python 3 the side-effecting map() calls
    below would be lazy no-ops; confirm the target interpreter.

    :type settings: SyntheticBaselineJoinSettings
    :return: the constructed QueryPlan
    """
    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators

    # One scan operator per partition of table A.
    scan_A = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(
                             get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                            settings.table_A_sharded, add_where=True)),
                         settings.format_, settings.use_pandas, settings.secure, settings.use_native,
                         'scan_A_{}'.format(p), query_plan, False)),
            range(0, settings.table_A_parts))

    # Map positional column names ('_0', '_1', ...) to table A field names.
    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)],
            settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
        'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    # Optional pandas-level filter on table A.
    if settings.table_A_filter_fn is not None:
        filter_A = map(lambda p: query_plan.add_operator(Filter(
            PredicateExpression(None, pd_expr=settings.table_A_filter_fn),
            'filter_A_{}'.format(p), query_plan, False)),
            range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(
                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                            settings.table_B_sharded, add_where=True)),
                         settings.format_, settings.use_pandas, settings.secure, settings.use_native,
                         'scan_B_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
        'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_B_filter_fn is not None:
        filter_b = map(lambda p: query_plan.add_operator(Filter(
            PredicateExpression(None, pd_expr=settings.table_B_filter_fn),
            'filter_b' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    # Table C (and the AB->C join machinery) only exists for 3-way joins.
    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p: query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                             "select "
                             " * "
                             "from "
                             " S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)),
                             settings.format_, settings.use_pandas, settings.secure, settings.use_native,
                             'scan_C_{}'.format(p), query_plan, False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
                settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(lambda p: query_plan.add_operator(Project(
            [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
            'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        filter_c = map(lambda p: query_plan.add_operator(Filter(
            PredicateExpression(None, pd_expr=settings.table_C_filter_fn),
            'filter_c' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        # Shuffle maps route AB-join output and C partitions to the same
        # hash-join instances by join key.
        map_B_to_C = map(lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_C_to_C = map(lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_AB_C = map(lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                'join_probe_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    # A-B join machinery is always present.
    map_A_to_B = map(lambda p: query_plan.add_operator(
        Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(lambda p: query_plan.add_operator(
        Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_A_B = map(lambda p: query_plan.add_operator(
        HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(lambda p: query_plan.add_operator(
        HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                      'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    # Partial aggregates sum the detail column of the last joined table.
    if settings.table_C_key is None:
        def part_aggregate_fn(df):
            # NOTE(review): np.float was removed in NumPy 1.24; the builtin
            # float is the equivalent replacement.
            sum_ = df[settings.table_B_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(lambda p: query_plan.add_operator(Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t: float(t[settings.table_B_detail_field_name]))
            ],
            settings.use_pandas,
            'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
            range(0, settings.other_parts))
    else:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(lambda p: query_plan.add_operator(Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t: float(t[settings.table_C_detail_field_name]))
            ],
            settings.use_pandas,
            'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    # Final reduce of the partial sums into a single row.
    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(Aggregate(
        [
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
        ],
        settings.use_pandas,
        'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    # NOTE(review): these map() calls are used purely for side effects and
    # depend on Python 2's eager map().
    map(lambda o: o.set_async(False), project_A)
    if settings.table_A_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_A)
    map(lambda o: o.set_async(False), project_B)
    if settings.table_B_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_b)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
        map(lambda o: o.set_async(False), filter_c)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    if settings.table_A_filter_fn is not None:
        connect_many_to_many(project_A, filter_A)
        connect_many_to_many(filter_A, map_A_to_B)
    else:
        connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)
    connect_many_to_many(scan_B, project_B)
    if settings.table_B_filter_fn is not None:
        connect_many_to_many(project_B, filter_b)
        connect_many_to_many(filter_b, map_B_to_B)
    else:
        connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)
    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, filter_c)
        connect_many_to_many(filter_c, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
def run(parallel, use_pandas, secure, use_native, buffer_size, format_,
        customer_parts, order_parts, lineitem_parts, customer_sharded,
        order_sharded, lineitem_sharded, other_parts, sf, expected_result,
        customer_filter_sql=None, order_filter_sql=None,
        lineitem_filter_sql=None):
    """Execute the TPC-H Q3 baseline join benchmark and assert its result.

    Builds a customer/orders/lineitem scan-project-filter plan, hash-joins
    customer with orders and the result with lineitem, groups by
    (l_orderkey, o_orderdate, o_shippriority) summing revenue, takes the
    top 10 rows, prints metrics and verifies the collated tuples against
    expected_result.

    Fixes over the original: order_project was built over
    range(0, customer_parts) instead of order_parts (copy-paste slip that
    misconnects order_scan -> order_project whenever the part counts
    differ); np.float (removed in NumPy 1.24) replaced with float; the
    side-effecting map() calls (lazy no-ops on Python 3) replaced with
    explicit loops/list comprehensions.

    :return: None
    """
    print('')
    print("TPCH Q3 Baseline Join")
    print("---------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # --- customer ----------------------------------------------------------
    # A SQL-side filter is pushed into the scan when *_filter_sql is given.
    customer_scan = [
        query_plan.add_operator(
            SQLTableScan(
                get_file_key('customer', customer_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                " S3Object "
                " {} "
                " {} ".format(
                    ' where ' + customer_filter_sql
                    if customer_filter_sql is not None else '',
                    get_sql_suffix('customer', customer_parts, p,
                                   customer_sharded,
                                   add_where=customer_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'customer_scan' + '_{}'.format(p), query_plan, False))
        for p in range(0, customer_parts)
    ]

    def customer_project_fn(df):
        # Keep custkey and mktsegment, renamed to their TPC-H names.
        df = df.filter(items=['_0', '_6'], axis=1)
        df.rename(columns={'_0': 'c_custkey', '_6': 'c_mktsegment'},
                  inplace=True)
        return df

    customer_project = [
        query_plan.add_operator(
            Project([], 'customer_project' + '_{}'.format(p), query_plan,
                    False, customer_project_fn))
        for p in range(0, customer_parts)
    ]

    def customer_filter_fn(df):
        return df['c_mktsegment'] == 'BUILDING'

    customer_filter = [
        query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=customer_filter_fn),
                   'customer_filter' + '_{}'.format(p), query_plan, False))
        for p in range(0, customer_parts)
    ]

    customer_map = [
        query_plan.add_operator(
            Map('c_custkey', 'customer_map' + '_' + str(p), query_plan,
                False))
        for p in range(0, customer_parts)
    ]

    # --- orders ------------------------------------------------------------
    order_scan = [
        query_plan.add_operator(
            SQLTableScan(
                get_file_key('orders', order_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                " S3Object "
                " {} "
                " {} ".format(
                    ' where ' + order_filter_sql
                    if order_filter_sql is not None else '',
                    get_sql_suffix('orders', order_parts, p, order_sharded,
                                   add_where=order_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'order_scan' + '_{}'.format(p), query_plan, False))
        for p in range(0, order_parts)
    ]

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_4', '_7'], axis=1)
        df.rename(columns={'_1': 'o_custkey', '_0': 'o_orderkey',
                           '_4': 'o_orderdate', '_7': 'o_shippriority'},
                  inplace=True)
        return df

    # BUG FIX: was range(0, customer_parts); order operators must match the
    # number of order partitions they are connected to one-to-one.
    order_project = [
        query_plan.add_operator(
            Project([], 'order_project' + '_{}'.format(p), query_plan,
                    False, order_project_fn))
        for p in range(0, order_parts)
    ]

    def order_filter_fn(df):
        return pd.to_datetime(df['o_orderdate']) < pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    order_filter = [
        query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=order_filter_fn),
                   'order_filter' + '_{}'.format(p), query_plan, False))
        for p in range(0, order_parts)
    ]

    order_map_1 = [
        query_plan.add_operator(
            Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan,
                False))
        for p in range(0, order_parts)
    ]

    customer_order_join_build = [
        query_plan.add_operator(
            HashJoinBuild('c_custkey',
                          'customer_order_join_build' + '_' + str(p),
                          query_plan, False))
        for p in range(0, other_parts)
    ]

    customer_order_join_probe = [
        query_plan.add_operator(
            HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                          'customer_order_join_probe' + '_' + str(p),
                          query_plan, False))
        for p in range(0, other_parts)
    ]

    # --- lineitem ----------------------------------------------------------
    lineitem_scan = [
        query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                " S3Object "
                " {} "
                " {} ".format(
                    ' where ' + lineitem_filter_sql
                    if lineitem_filter_sql is not None else "",
                    get_sql_suffix('lineitem', lineitem_parts, p,
                                   lineitem_sharded,
                                   add_where=lineitem_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'lineitem_scan' + '_{}'.format(p), query_plan, False))
        for p in range(0, lineitem_parts)
    ]

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_5', '_6', '_10'], axis=1)
        df.rename(columns={'_0': 'l_orderkey', '_5': 'l_extendedprice',
                           '_6': 'l_discount', '_10': 'l_shipdate'},
                  inplace=True)
        return df

    lineitem_project = [
        query_plan.add_operator(
            Project([], 'lineitem_project' + '_{}'.format(p), query_plan,
                    False, lineitem_project_fn))
        for p in range(0, lineitem_parts)
    ]

    def lineitem_filter_fn(df):
        return pd.to_datetime(df['l_shipdate']) > pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    lineitem_filter = [
        query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=lineitem_filter_fn),
                   'lineitem_filter' + '_{}'.format(p), query_plan, False))
        for p in range(0, lineitem_parts)
    ]

    lineitem_map = [
        query_plan.add_operator(
            Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan,
                False))
        for p in range(0, lineitem_parts)
    ]

    order_map_2 = [
        query_plan.add_operator(
            Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan,
                False))
        for p in range(0, other_parts)
    ]

    customer_order_lineitem_join_build = [
        query_plan.add_operator(
            HashJoinBuild('o_orderkey',
                          'customer_order_lineitem_join_build' + '_' +
                          str(p), query_plan, False))
        for p in range(0, other_parts)
    ]

    customer_order_lineitem_join_probe = [
        query_plan.add_operator(
            HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                          'customer_order_lineitem_join_probe' + '_' +
                          str(p), query_plan, False))
        for p in range(0, other_parts)
    ]

    # --- grouping, top-k, collate ------------------------------------------
    def groupby_fn(df):
        # np.float was removed in NumPy 1.24; builtin float is equivalent.
        df['l_extendedprice'] = df['l_extendedprice'].astype(float)
        df['l_discount'] = df['l_discount'].astype(float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = [
        query_plan.add_operator(
            Group(['l_orderkey', 'o_orderdate', 'o_shippriority'],
                  [AggregateExpression(
                      AggregateExpression.SUM,
                      lambda t_: float(t_['l_extendedprice'] *
                                       (1 - t_['l_discount'])))],
                  'group' + '_{}'.format(p), query_plan, False, groupby_fn))
        for p in range(0, other_parts)
    ]

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(['l_orderkey', 'o_orderdate', 'o_shippriority'],
              [AggregateExpression(
                  AggregateExpression.SUM,
                  lambda t_: float(t_['l_extendedprice'] *
                                   (1 - t_['l_discount'])))],
              'group_reduce', query_plan, False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10,
            [SortExpression('revenue', float, 'DESC'),
             SortExpression('o_orderdate', date, 'ASC')],
            use_pandas, 'top', query_plan, False))

    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    # Inline what we can. Explicit loop instead of map(): map() is lazy on
    # Python 3 and would silently skip these side effects.
    for op in (lineitem_project + customer_project + order_project +
               lineitem_filter + customer_filter + order_filter +
               lineitem_map + customer_map + order_map_1 + order_map_2):
        op.set_async(False)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_filter)
    connect_many_to_many(customer_filter, customer_map)
    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_filter)
    connect_many_to_many(order_filter, order_map_1)
    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build,
                         customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(lineitem_filter, lineitem_map)
    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build,
                         customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)
    connect_many_to_many(customer_order_lineitem_join_probe, group)
    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()
    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']
    assert len(tuples) == 10 + 1
    assert tuples[0] == field_names
    # NOTE: This result has been verified with the equivalent data and query
    # on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)