def sql_scan_lineitem_select_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    """Build a lineitem scan that pushes a TPC-H Q19-style filter to S3 Select.

    The pushed-down predicate is a disjunction of three quantity bands
    (3..13, 16..26, 24..34), each additionally restricted to air shipment
    delivered in person. Only the pricing/shipping columns needed downstream
    are projected.

    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " l_partkey, "
                " l_quantity, "
                " l_extendedprice, "
                " l_discount, "
                " l_shipinstruct, "
                " l_shipmode "
                "from "
                " S3Object "
                "where "
                " ( "
                " ( "
                " cast(l_quantity as integer) >= 3 and cast(l_quantity as integer) <= 3 + 10 "
                " and l_shipmode in ('AIR', 'AIR REG') "
                " and l_shipinstruct = 'DELIVER IN PERSON' "
                " ) "
                " or "
                " ( "
                " cast(l_quantity as integer) >= 16 and cast(l_quantity as integer) <= 16 + 10 "
                " and l_shipmode in ('AIR', 'AIR REG') "
                " and l_shipinstruct = 'DELIVER IN PERSON' "
                " ) "
                " or "
                " ( "
                " cast(l_quantity as integer) >= 24 and cast(l_quantity as integer) <= 24 + 10 "
                " and l_shipmode in ('AIR', 'AIR REG') "
                " and l_shipinstruct = 'DELIVER IN PERSON' "
                " ) "
                " ) {}").format(get_sql_suffix('lineitem', num_shards, shard, sharded))
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_),
        scan_sql,
        format_, use_pandas, secure, use_native, name, query_plan, False)
def sql_scan_part_partkey_brand_size_container_where_extra_filtered_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    """Build a part scan restricted to three known part keys, further
    filtered by the Q19-style brand/container/size disjunction, all pushed
    down to S3 Select.

    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " p_partkey, "
                " p_brand, "
                " p_size, "
                " p_container "
                "from "
                " S3Object "
                "where "
                " ("
                " p_partkey = '103853' or "
                " p_partkey = '104277' or "
                " p_partkey = '104744' "
                " ) and "
                " ( "
                " ( "
                " p_brand = 'Brand#11' "
                " and p_container in ("
                " 'SM CASE', "
                " 'SM BOX', "
                " 'SM PACK', "
                " 'SM PKG'"
                " ) "
                " and cast(p_size as integer) between 1 and 5 "
                " ) "
                " or "
                " ( "
                " p_brand = 'Brand#44' "
                " and p_container in ("
                " 'MED BAG', "
                " 'MED BOX', "
                " 'MED PKG', "
                " 'MED PACK'"
                " ) "
                " and cast(p_size as integer) between 1 and 10 "
                " ) "
                " or "
                " ( "
                " p_brand = 'Brand#53' "
                " and p_container in ("
                " 'LG CASE', "
                " 'LG BOX', "
                " 'LG PACK', "
                " 'LG PKG'"
                " ) "
                " and cast(p_size as integer) between 1 and 15 "
                " ) "
                " ) {}").format(get_sql_suffix('part', num_shards, shard, sharded))
    # NOTE(review): unlike the lineitem variants this call passes no format_
    # argument to SQLTableScan — presumably relying on a default; confirm.
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf),
        scan_sql,
        use_pandas, secure, use_native, name, query_plan, False)
def sql_filtered_scan_lineitem_operator_def(max_shipped_date, sharded, shard,
                                            sf, use_pandas, secure, use_native,
                                            name, query_plan, format_):
    """Build a lineitem scan projecting the TPC-H Q1 columns, keeping only
    rows shipped on or before ``max_shipped_date`` (filter pushed to S3
    Select).

    :return: the configured SQLTableScan operator
    """
    cutoff = max_shipped_date.strftime('%Y-%m-%d')
    scan_sql = (" select l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate "
                " from S3Object "
                " where cast(l_shipdate as timestamp) <= cast('{}' as timestamp) "
                ";").format(cutoff)
    return SQLTableScan(get_file_key('lineitem', sharded, shard, sf),
                        scan_sql,
                        format_, use_pandas, secure, use_native, name,
                        query_plan, False)
def sql_scan_part_select_all_op(sharded, shard, num_shards, use_pandas, secure,
                                use_native, name, query_plan, sf, format_):
    """Build a full scan of the part table (select *) with no pushed-down
    filtering beyond the standard shard suffix.

    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " * "
                "from "
                " S3Object "
                " {} ").format(
                    get_sql_suffix('part', num_shards, shard, sharded, add_where=True))
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf, format_),
        scan_sql,
        format_, use_pandas, secure, use_native, name, query_plan, False)
def bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_extra_filtered_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    """Build a bloom-filtered lineitem scan (probing on l_partkey), with the
    three-part-key restriction and the Q19-style quantity/shipmode/
    shipinstruct disjunction pushed down to S3 Select.

    :return: the configured SQLTableScanBloomUse operator
    """
    scan_sql = ("select "
                " l_partkey, "
                " l_quantity, "
                " l_extendedprice, "
                " l_discount, "
                " l_shipinstruct, "
                " l_shipmode "
                "from "
                " S3Object "
                "where "
                " ("
                " l_partkey = '103853' or "
                " l_partkey = '104277' or "
                " l_partkey = '104744' "
                " ) and "
                " ( "
                " ( "
                " cast(l_quantity as integer) >= 3 "
                " and cast(l_quantity as integer) <= 3 + 10 "
                " and l_shipmode in ('AIR', 'AIR REG') "
                " and l_shipinstruct = 'DELIVER IN PERSON' "
                " ) "
                " or "
                " ( "
                " cast(l_quantity as integer) >= 16 "
                " and cast(l_quantity as integer) <= 16 + 10 "
                " and l_shipmode in ('AIR', 'AIR REG') "
                " and l_shipinstruct = 'DELIVER IN PERSON' "
                " ) "
                " or "
                " ( "
                " cast(l_quantity as integer) >= 24 "
                " and cast(l_quantity as integer) <= 24 + 10 "
                " and l_shipmode in ('AIR', 'AIR REG') "
                " and l_shipinstruct = 'DELIVER IN PERSON' "
                " ) "
                " ) {}").format(get_sql_suffix('lineitem', num_shards, shard, sharded))
    return SQLTableScanBloomUse(
        get_file_key('lineitem', sharded, shard, sf),
        scan_sql,
        'l_partkey',
        use_pandas, secure, use_native, name, query_plan, False)
def sql_scan_part_select_all_where_partkey_op(sharded, shard, num_shards,
                                              use_pandas, secure, use_native,
                                              name, query_plan, sf):
    """Build a part scan (select *) restricted to three known part keys,
    with the restriction pushed down to S3 Select.

    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " * "
                "from "
                " S3Object "
                "where "
                " ("
                " p_partkey = '103853' or "
                " p_partkey = '104277' or "
                " p_partkey = '104744'"
                " ) {}").format(get_sql_suffix('part', num_shards, shard, sharded))
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf),
        scan_sql,
        use_pandas, secure, use_native, name, query_plan, False)
def bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    """Build a bloom-filtered lineitem scan (probing on l_partkey) that also
    pushes a two-part-key IN-list predicate down to S3 Select.

    :return: the configured SQLTableScanBloomUse operator
    """
    scan_sql = ("select "
                " l_orderkey, l_partkey, l_quantity, l_extendedprice "
                "from "
                " S3Object "
                "where "
                " l_partkey in ('181726', '182405') "
                " {}").format(
                    get_sql_suffix('lineitem', num_shards, shard, sharded, add_where=False))
    return SQLTableScanBloomUse(
        get_file_key('lineitem', sharded, shard, sf),
        scan_sql,
        'l_partkey',
        use_pandas, secure, use_native, name, query_plan, False)
def main():
    """Scan all 32 lineitem shards in full and collate the results."""
    parts = 32
    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan: one full-table scan operator per shard.
    lineitem_scan = [
        query_plan.add_operator(
            SQLTableScan(get_file_key('lineitem', True, p),
                         "select * from S3Object;",
                         Format.CSV,
                         use_pandas=True,
                         secure=False,
                         use_native=False,
                         name='scan_' + str(p),
                         query_plan=query_plan,
                         log_enabled=False))
        for p in range(0, parts)
    ]

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    # Fan every shard scan into the single collate operator.
    for scan_op in lineitem_scan:
        scan_op.connect(collate)

    query_plan.execute()
def sql_scan_part_select_all_where_brand_and_container_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    """with part_scan as (select * from part)

    Scans part keeping only rows with brand 'Brand#41' in container
    'SM PACK' (filter pushed down to S3 Select).

    :param query_plan:
    :param name:
    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " * "
                "from "
                " S3Object "
                "where "
                " p_brand = 'Brand#41' and "
                " p_container = 'SM PACK' "
                " {} ").format(
                    get_sql_suffix('part', num_shards, shard, sharded, add_where=False))
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf),
        scan_sql,
        use_pandas, secure, use_native, name, query_plan, False)
def sql_scan_lineitem_select_orderkey_partkey_quantity_extendedprice(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    """with lineitem_scan as (select * from lineitem)

    Scans lineitem projecting only the four join/pricing columns, with no
    pushed-down filtering beyond the standard shard suffix.

    :param query_plan:
    :param name:
    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " l_orderkey, l_partkey, l_quantity, l_extendedprice "
                "from "
                " S3Object "
                " {} ").format(
                    get_sql_suffix('lineitem', num_shards, shard, sharded, add_where=True))
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_),
        scan_sql,
        format_, use_pandas, secure, use_native, name, query_plan, False)
def bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    """Build a bloom-filtered lineitem scan (probing on l_partkey) that
    projects the four join/pricing columns; no predicate is pushed down
    beyond the shard suffix.

    :return: the configured SQLTableScanBloomUse operator
    """
    scan_sql = ("select "
                " l_orderkey, l_partkey, l_quantity, l_extendedprice "
                "from "
                " S3Object "
                " {}").format(
                    get_sql_suffix('lineitem', num_shards, shard, sharded, add_where=True))
    return SQLTableScanBloomUse(
        get_file_key('lineitem', sharded, shard, sf, format_),
        scan_sql,
        'l_partkey',
        format_=format_,
        use_pandas=use_pandas,
        secure=secure,
        use_native=use_native,
        name=name,
        query_plan=query_plan,
        log_enabled=False)
def sql_scan_lineitem_select_all_where_partkey_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    """with lineitem_scan as (select * from lineitem)

    Scans lineitem (select *) restricted to a two-part-key IN-list, with
    the restriction pushed down to S3 Select.

    :param query_plan:
    :param name:
    :return: the configured SQLTableScan operator
    """
    scan_sql = ("select "
                " * "
                "from "
                " S3Object "
                "where "
                " l_partkey in ('181726', '182405') "
                " {}").format(
                    get_sql_suffix('lineitem', num_shards, shard, sharded, add_where=False))
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf),
        scan_sql,
        use_pandas, secure, use_native, name, query_plan, False)
def query_plan(settings):
    # type: (SyntheticSemiJoinSettings) -> QueryPlan
    """Build the semi-join benchmark query plan.

    Joins table A to table B on the AB join key (bloom-filtered), then —
    when a table C is configured — joins on through B's BC join key to C,
    fetches the detail column via a second bloom-filtered scan keyed on the
    primary key, and SUMs it. When table C is not configured the same
    two-phase detail fetch is done against table B itself.

    :param settings: benchmark configuration (table keys, part counts,
        join keys, filters, formats)
    :return: the assembled QueryPlan
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators

    # Phase 1: scan table A, projecting only its AB join key, with A's
    # filter pushed down to S3 Select.
    scan_a = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                         "select "
                         " {} "
                         "from "
                         " S3Object "
                         "where "
                         " {} "
                         " {} "
                         .format(settings.table_A_AB_join_key,
                                 settings.table_A_filter_sql,
                                 get_sql_suffix(settings.table_A_key,
                                                settings.table_A_parts, p,
                                                settings.table_A_sharded)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_a' + '_{}'.format(p),
                         query_plan,
                         False)),
            range(0, settings.table_A_parts))

    # S3 Select returns positional column names (_0, _1, ...); map them
    # back to A's real field names.
    field_names_map_a = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_a(df):
        df = df.rename(columns=field_names_map_a, copy=False)
        return df

    project_a = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_a.iteritems()
            ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a)
        ), range(0, settings.table_A_parts))

    # Bloom filter over A's join keys, used to prune the scan of B.
    bloom_create_ab_join_key = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key,
                        'bloom_create_ab_join_key' + '_{}'.format(p),
                        query_plan, False, fp_rate=settings.fp_rate)),
        range(0, settings.table_A_parts))

    # Phase 2: bloom-filtered scan of B projecting its BC and AB join keys.
    scan_b_on_ab_join_key = \
        map(lambda p: query_plan.add_operator(
            SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                 "select "
                                 " {},{} "
                                 "from "
                                 " S3Object "
                                 "where "
                                 " {} "
                                 " {} "
                                 .format(settings.table_B_BC_join_key,
                                         settings.table_B_AB_join_key,
                                         settings.table_B_filter_sql,
                                         get_sql_suffix(settings.table_B_key,
                                                        settings.table_B_parts, p,
                                                        settings.table_B_sharded,
                                                        add_where=False)),
                                 settings.format_,
                                 settings.table_B_AB_join_key,
                                 settings.use_pandas,
                                 settings.secure,
                                 settings.use_native,
                                 'scan_b_on_ab_join_key' + '_{}'.format(p),
                                 query_plan,
                                 False)),
            range(0, settings.table_B_parts))
    # NOTE(review): here format_ is passed BEFORE the bloom field name, but
    # the C-branch scans below pass the bloom field name BEFORE format_.
    # One of the two orderings is presumably wrong for a fixed
    # SQLTableScanBloomUse signature — verify against its definition.

    if settings.table_C_key is None:

        # No table C: fetch B's detail column via a second bloom-filtered
        # scan of B keyed on its primary key.
        scan_b_detail_on_b_pk = \
            map(lambda p: query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                     "select "
                                     " {},{} "
                                     "from "
                                     " S3Object "
                                     "where "
                                     " {} "
                                     " {} "
                                     .format(settings.table_B_primary_key,
                                             settings.table_B_detail_field_name,
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key,
                                                            settings.table_B_parts, p,
                                                            settings.table_B_sharded,
                                                            add_where=False)),
                                     settings.format_,
                                     settings.table_B_primary_key,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_c_detail_on_b_pk' + '_{}'.format(p),
                                     query_plan,
                                     False)),
                range(0, settings.table_B_parts))
        # NOTE(review): operator is named 'scan_c_detail_on_b_pk' although it
        # scans table B — likely a copy/paste of the C-branch name.

        field_names_map_b_detail = OrderedDict([
            ('_0', settings.table_B_primary_key),
            ('_1', settings.table_B_detail_field_name)
        ])

        def project_fn_b_detail(df):
            df.rename(columns=field_names_map_b_detail, inplace=True)
            return df

        project_b_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_b_detail.iteritems()
                ], 'project_b_detail' + '_{}'.format(p), query_plan, False,
                        project_fn_b_detail)),
            range(0, settings.table_B_parts))

        # Shuffle (hash-partition) operators on B's primary key, for the
        # build and probe sides of the AB⋈B-detail join.
        map_b_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key,
                    'map_b_pk_1' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_b_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key,
                    'map_b_pk_2' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

        bloom_create_b_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_primary_key,
                            'bloom_create_b_pk' + '_{}'.format(p), query_plan,
                            False, fp_rate=settings.fp_rate)),
            range(0, settings.table_B_parts))

        join_probe_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_primary_key,
                                   settings.table_B_primary_key),
                    'join_probe_ab_and_b_on_b_pk' + '_{}'.format(p),
                    query_plan, False)),
            range(0, settings.table_B_parts))

        join_build_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_primary_key,
                              'join_build_ab_and_b_on_b_pk' + '_{}'.format(p),
                              query_plan, False)),
            range(0, settings.table_B_parts))

    else:

        # Table C present: bloom-filtered scan of C on the BC join key,
        # projecting C's primary key and BC join key.
        scan_c_on_bc_join_key = \
            map(lambda p: query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                     "select "
                                     " {}, {} "
                                     "from "
                                     " S3Object "
                                     "where "
                                     " {} "
                                     " {} "
                                     .format(settings.table_C_primary_key,
                                             settings.table_C_BC_join_key,
                                             settings.table_C_filter_sql,
                                             get_sql_suffix(settings.table_C_key,
                                                            settings.table_C_parts, p,
                                                            settings.table_C_sharded,
                                                            add_where=False)),
                                     settings.table_C_BC_join_key,
                                     settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_c_on_bc_join_key' + '_{}'.format(p),
                                     query_plan,
                                     False)),
                range(0, settings.table_C_parts))

        field_names_map_c = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_c(df):
            df.rename(columns=field_names_map_c, inplace=True)
            return df

        project_c = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c.iteritems()
                ], 'project_c' + '_{}'.format(p), query_plan, False,
                        project_fn_c)),
            range(0, settings.table_C_parts))

        # Second bloom-filtered scan of C, keyed on its primary key, to
        # fetch the detail column that is finally aggregated.
        scan_c_detail_on_c_pk = \
            map(lambda p: query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                     "select "
                                     " {},{} "
                                     "from "
                                     " S3Object "
                                     "where "
                                     " {} "
                                     " {} "
                                     .format(settings.table_C_primary_key,
                                             settings.table_C_detail_field_name,
                                             settings.table_C_filter_sql,
                                             get_sql_suffix(settings.table_C_key,
                                                            settings.table_C_parts, p,
                                                            settings.table_C_sharded,
                                                            add_where=False)),
                                     settings.table_C_primary_key,
                                     settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_c_detail_on_c_pk' + '_{}'.format(p),
                                     query_plan,
                                     False)),
                range(0, settings.table_C_parts))

        field_names_map_c_detail = OrderedDict([
            ('_0', settings.table_C_primary_key),
            ('_1', settings.table_C_detail_field_name)
        ])

        def project_fn_c_detail(df):
            df.rename(columns=field_names_map_c_detail, inplace=True)
            return df

        project_c_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c_detail.iteritems()
                ], 'project_c_detail' + '_{}'.format(p), query_plan, False,
                        project_fn_c_detail)),
            range(0, settings.table_C_parts))

        # Shuffle operators for the BC join and the C-detail join.
        # NOTE(review): map_bc_b_join_key ranges over table_C_parts while
        # map_bc_c_join_key (below) ranges over table_B_parts — confirm
        # this asymmetry is intentional.
        map_bc_b_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key,
                    'map_bc_b_join_key' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_c_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key,
                    'map_c_pk_1' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_c_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key,
                    'map_c_pk_2' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        bloom_create_c_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_C_primary_key,
                            'bloom_create_bc_b_to_c_join_key_{}'.format(p),
                            query_plan, False, fp_rate=settings.fp_rate)),
            range(0, settings.table_C_parts))

        join_build_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(
                    settings.table_B_BC_join_key,
                    'join_build_ab_and_c_on_bc_join_key' + '_{}'.format(p),
                    query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key,
                                   settings.table_C_BC_join_key),
                    'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format(p),
                    query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_C_primary_key,
                              'join_build_abc_and_c_on_c_pk' + '_{}'.format(p),
                              query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_C_primary_key,
                                   settings.table_C_primary_key),
                    'join_probe_abc_and_c_on_c_pk' + '_{}'.format(p),
                    query_plan, False)),
            range(0, settings.table_C_parts))

        # No fp_rate here, unlike the other BloomCreate calls.
        bloom_create_bc_join_key = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_BC_join_key,
                            'bloom_create_bc_join_key' + '_{}'.format(p),
                            query_plan, False)),
            range(0, settings.table_B_parts))

        map_bc_c_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key,
                    'map_bc_c_join_key' + '_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    # Common operators: rename B's positional columns, shuffle both sides
    # of the AB join, and build/probe the AB hash join.
    field_names_map_b = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_b(df):
        df.rename(columns=field_names_map_b, inplace=True)
        return df

    project_b = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_b.iteritems()
            ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b)
        ), range(0, settings.table_B_parts))

    map_ab_a_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key,
                'map_ab_a_join_key' + '_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_ab_b_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key,
                'map_ab_b_join_key' + '_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                settings.table_A_AB_join_key,
                'join_build_a_and_b_on_ab_join_key' + '_{}'.format(p),
                query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key,
                               settings.table_B_AB_join_key),
                'join_probe_a_and_b_on_ab_join_key' + '_{}'.format(p),
                query_plan, False)),
        range(0, settings.table_B_parts))

    # Per-part partial SUM of the detail column (B's or C's, depending on
    # whether a table C is configured).
    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            # NOTE(review): np.float is deprecated in modern NumPy; this
            # code presumably runs against an older NumPy — verify.
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM,
                        lambda t: float(t[settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_B_parts))
    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM,
                        lambda t: float(t[settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    # Reduce the partial sums to a single total.
    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators (run them synchronously inside their
    # producer instead of as separate async workers).
    map(lambda o: o.set_async(False), project_a)
    map(lambda o: o.set_async(False), project_b)
    map(lambda o: o.set_async(False), map_ab_a_join_key)
    map(lambda o: o.set_async(False), map_ab_b_join_key)
    if settings.table_C_key is None:
        map(lambda o: o.set_async(False), map_b_pk_1)
        map(lambda o: o.set_async(False), map_b_pk_2)
        map(lambda o: o.set_async(False), project_b_detail)
    else:
        map(lambda o: o.set_async(False), map_bc_b_join_key)
        map(lambda o: o.set_async(False), map_bc_c_join_key)
        map(lambda o: o.set_async(False), map_c_pk_1)
        map(lambda o: o.set_async(False), map_c_pk_2)
        map(lambda o: o.set_async(False), project_c)
        map(lambda o: o.set_async(False), project_c_detail)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_a, project_a)
    connect_many_to_many(project_a, map_ab_a_join_key)
    connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key)
    connect_all_to_all(project_a, bloom_create_ab_join_key)
    # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key)
    connect_many_to_many(join_build_a_and_b_on_ab_join_key,
                         join_probe_a_and_b_on_ab_join_key)
    # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key)
    connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key)
    connect_many_to_many(scan_b_on_ab_join_key, project_b)
    # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key)
    # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key)
    connect_many_to_many(project_b, map_ab_b_join_key)
    connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key)
    # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B)

    if settings.table_C_key is None:
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate)
        connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail)
        connect_many_to_many(project_b_detail, map_b_pk_2)
        connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk)
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_b_pk)
        connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate)
        connect_many_to_many(join_build_ab_and_b_on_b_pk,
                             join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1)
        connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk)
    else:
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_bc_join_key)
        connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key)
        connect_many_to_many(scan_c_on_bc_join_key, project_c)
        # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_bc_c_join_key,
                           join_probe_ab_and_c_on_bc_join_key)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key,
                             map_bc_b_join_key)
        connect_all_to_all(map_bc_b_join_key,
                           join_build_ab_and_c_on_bc_join_key)
        connect_all_to_all(join_probe_ab_and_c_on_bc_join_key,
                           bloom_create_c_pk)
        # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1)
        connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk)
        # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk)
        connect_many_to_many(join_build_abc_and_c_on_c_pk,
                             join_probe_abc_and_c_on_c_pk)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C)
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail)
        # connect_many_to_many(project_c_detail, map_C_to_C)
        # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk)
        connect_many_to_many(project_c_detail, map_c_pk_2)
        connect_many_to_many(project_c, map_bc_c_join_key)
        connect_many_to_many(join_build_ab_and_c_on_bc_join_key,
                             join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk)
        connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf,
        nthreads=16, format_=Format.CSV):
    """Run the indexing benchmark: scan per-shard index files for the byte
    ranges where l_extendedprice lies in [lower, upper], fetch exactly those
    byte ranges from the lineitem shards, and count the returned rows.

    :param parallel: run the plan asynchronously
    :param use_pandas: use the pandas-based operator implementations
    :param buffer_size: operator buffer size
    :param table_parts: number of lineitem shards to scan
    :param lower: inclusive lower bound on the indexed column value
    :param upper: inclusive upper bound on the indexed column value
    :param sf: TPC-H scale factor (only 1 and 10 are supported)
    :param nthreads: threads per range-access operator
    :param format_: S3 Select input format for the index scans
    """
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Only these scale factors have pre-built index files.
    assert sf == 1 or sf == 10

    # Scan Index Files: each index row maps a column-value range to the
    # first/last byte offsets of the matching rows in the data shard.
    # NOTE(review): the shard suffix is p for sf==1 but p+1 for sf==10 —
    # presumably the sf-10 index files are 1-based; confirm.
    index_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}'
                .format(sf, p if sf == 1 else p + 1),
                "select first_byte, last_byte "
                " from S3Object "
                " where col_values >= {} and col_values <= {};".format(
                    lower, upper),
                format_, use_pandas, secure, use_native,
                'index_scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Range accesses: fetch only the byte ranges produced by the index scan.
    range_access = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(
                get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV),
                use_pandas, secure, use_native,
                'range_access_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    # Aggregation: just count the rows that came back.
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Wire index scan p -> range access p, then fan all range accesses into
    # the single aggregate.
    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(range_access))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()
    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """Build the baseline (no bloom filter, no semi-join) three-table join
    plan: full scans of A, B and C with client-side filter functions, two
    partitioned hash joins (A⋈B on the AB key, AB⋈C on the BC key) and a
    SUM over C's detail column.

    :type settings: SyntheticBaselineJoinSettings
    :return: the assembled QueryPlan
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    # Client-side rename + filter applied to each scanned A partition.
    def scan_A_fun(df):
        df.columns = settings.table_A_field_names
        criterion = settings.table_A_filter_fn(df)
        df = df[criterion]
        return df

    # Define the operators
    scan_A = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(get_sql_suffix(settings.table_A_key,
                                                settings.table_A_parts, p,
                                                settings.table_A_sharded,
                                                add_where=True)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_A_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_A_fun)),
            range(0, settings.table_A_parts))

    def scan_B_fun(df):
        df.columns = settings.table_B_field_names
        criterion = settings.table_B_filter_fn(df)
        return df[criterion]

    scan_B = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(get_sql_suffix(settings.table_B_key,
                                                settings.table_B_parts, p,
                                                settings.table_B_sharded,
                                                add_where=True)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_B_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_B_fun)),
            range(0, settings.table_B_parts))

    # Superseded project/filter operators, kept for reference.
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
        'project_B_{}'.format(p), query_plan, True, project_fn_B)),
        range(0, settings.table_B_parts))

    filter_b = map(lambda p: query_plan.add_operator(Filter(
        PredicateExpression(None, pd_expr=settings.table_B_filter_fn),
        'filter_b' + '_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))
    """

    def scan_C_fun(df):
        df.columns = settings.table_C_field_names
        criterion = settings.table_C_filter_fn(df)
        return df[criterion]

    scan_C = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}"
                         .format(get_sql_suffix(settings.table_C_key,
                                                settings.table_C_parts, p,
                                                settings.table_C_sharded,
                                                add_where=True)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_C_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_C_fun)),
            range(0, settings.table_C_parts))

    # Superseded project/filter operators, kept for reference.
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
            settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
        'project_C_{}'.format(p), query_plan, True, project_fn_C)),
        range(0, settings.table_C_parts))

    filter_c = map(lambda p: query_plan.add_operator(Filter(
        PredicateExpression(None, pd_expr=settings.table_C_filter_fn),
        'filter_c' + '_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))
    """

    # Shuffle (hash-partition) operators feeding the two joins.
    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)),
        range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)),
        range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)),
        range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key,
                          'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key,
                               settings.table_B_AB_join_key),
                'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key,
                          'join_build_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key,
                               settings.table_C_BC_join_key),
                'join_probe_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    # Per-part partial SUM over C's detail column.
    def agg_fun(df):
        return pd.DataFrame({
            'sum': [df[settings.table_C_detail_field_name].astype(float).sum()]
        })
        #return pd.DataFrame( { 'sum' : [ len(df) ] } )

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM,
                    lambda t: float(t[settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, agg_fun)),
        range(0, settings.table_C_parts))

    # Reduce the partial sums to a single total.
    def agg_reduce_fun(df):
        return pd.DataFrame({'sum': [df['sum'].sum()]})

    # NOTE(review): the pandas path (agg_reduce_fun) reads column 'sum' while
    # the tuple-path expression reads t['_0'] — these look inconsistent;
    # confirm which path is actually exercised.
    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  agg_reduce_fun))

    """
    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan, False))
    """

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(project_A, filter_A)
    #connect_many_to_many(filter_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(project_B, filter_b)
    #connect_many_to_many(filter_b, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    #connect_many_to_many(join_probe_A_B, part_aggregate)
    #connect_many_to_one(part_aggregate, aggregate_reduce)
    #connect_one_to_one(aggregate_reduce, collate)

    connect_many_to_many(join_build_AB_C, join_probe_AB_C)
    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(project_C, filter_c)
    #connect_many_to_many(filter_c, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, collate)
    #connect_one_to_one(aggregate_project, collate)

    return query_plan
def query_plan(settings):
    # type: (SyntheticFilteredJoinSettings) -> QueryPlan
    """Assemble a filtered hash-join query plan over tables A and B, and
    optionally C (when settings.table_C_key is not None).

    Pipeline: scan -> project (rename positional columns) -> Map (shuffle on
    join key) -> hash join(s) -> per-part sum aggregate -> reduce aggregate ->
    project -> collate.

    :param settings: SyntheticFilteredJoinSettings carrying table keys, part
        counts, sharding flags, join/detail field names and runtime options.
    :return: the constructed QueryPlan
    """

    # Shared-memory worker system is only created when operators exchange
    # tuples through shared memory.
    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators

    # One S3 SQL scan per part of table A. A 'where' clause is emitted only
    # when a filter is configured; otherwise get_sql_suffix adds it as needed.
    scan_A = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                         "select "
                         " {} "
                         "from "
                         " S3Object "
                         " {} "
                         " {} "
                         .format(','.join(settings.table_A_field_names),
                                 ' where {} '.format(
                                     settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '',
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded,
                                                add_where=settings.table_A_filter_sql is None)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_A_{}'.format(p),
                         query_plan,
                         False)),
            range(0, settings.table_A_parts))

    # Maps S3 Select positional column names ('_0', '_1', ...) back to the
    # configured field names for table A.
    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)],
            settings.table_A_field_names))

    def project_fn_A(df):
        # copy=False avoids copying the underlying data on rename.
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                    'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    # One S3 SQL scan per part of table B (same optional-filter handling as A).
    scan_B = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                "select "
                " {} "
                "from "
                " S3Object "
                " {} "
                " {} "
                .format(','.join(settings.table_B_field_names),
                        ' where {} '.format(
                            settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '',
                        get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                       settings.table_B_sharded,
                                       add_where=settings.table_B_filter_sql is None)),
                settings.format_,
                settings.use_pandas,
                settings.secure,
                settings.use_native,
                'scan_B_{}'.format(p),
                query_plan,
                False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        # NOTE(review): uses inplace=True while project_fn_A/C use
        # df.rename(..., copy=False); behavior is the same, style differs.
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                    'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    # Table C operators only exist for the three-way join variant.
    if settings.table_C_key is not None:

        # C always carries a filter, so 'where' is written directly into the SQL.
        scan_C = \
            map(lambda p: query_plan.add_operator(
                SQLTableScan(
                    get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                    "select "
                    " {} "
                    "from "
                    " S3Object "
                    "where "
                    " {} "
                    " {} "
                    .format(','.join(settings.table_C_field_names),
                            settings.table_C_filter_sql,
                            get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                           settings.table_C_sharded, add_where=False)),
                    settings.format_,
                    settings.use_pandas,
                    settings.secure,
                    settings.use_native,
                    'scan_C_{}'.format(p),
                    query_plan,
                    False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
                settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        # Shuffle the A-B join output and the C scans on the B/C join key.
        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                              'join_probe_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    # Shuffle A and B tuples on the A/B join key.
    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    # A-B join parallelism is settings.other_parts, independent of the scan
    # part counts.
    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                          'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    # Per-part sum over the detail field of the last joined table.
    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            # NOTE(review): np.float is removed in NumPy >= 1.20; modern NumPy
            # needs float or np.float64 here.
            sum_ = df[settings.table_B_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(AggregateExpression.SUM,
                                        lambda t: float(t[settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
            range(0, settings.other_parts))
    else:

        def part_aggregate_fn(df):
            # NOTE(review): np.float is removed in NumPy >= 1.20.
            sum_ = df[settings.table_C_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(AggregateExpression.SUM,
                                        lambda t: float(t[settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        # Combine the partial sums into a single total.
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators (run them synchronously in their producer).
    map(lambda o: o.set_async(False), project_A)
    map(lambda o: o.set_async(False), project_B)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)
    connect_many_to_many(scan_B, project_B)
    connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)
    if settings.table_C_key is None:
        # Two-way variant: A-B join feeds the aggregate directly.
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        # Three-way variant: A-B output is re-shuffled and joined with C.
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded, order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result,
        customer_filter_sql=None, order_filter_sql=None, lineitem_filter_sql=None):
    """Run TPC-H Q3 as a bloom-join query plan and assert the result.

    customer is scanned first; a bloom filter on c_custkey prunes the orders
    scan, and a second bloom filter on o_orderkey prunes the lineitem scan.
    The joined rows are grouped, reduced, sorted (top 10 by revenue desc,
    o_orderdate asc) and collated, then compared against expected_result.

    :param fp_rate: bloom filter false-positive rate
    :param expected_result: tuples the collated output must match
    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan only c_custkey for customers in the BUILDING segment.
    customer_scan = map(lambda p: query_plan.add_operator(
        SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                     "select "
                     " c_custkey "
                     "from "
                     " S3Object "
                     "where "
                     " c_mktsegment = 'BUILDING' "
                     " {} "
                     " {} "
                     .format(
                         ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                         get_sql_suffix('customer', customer_parts, p, customer_sharded, add_where=False)),
                     format_, use_pandas, secure, use_native,
                     'customer_scan' + '_{}'.format(p),
                     query_plan,
                     False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        # Keep only the key column and give it its TPC-H name.
        df = df.filter(items=['_0'], axis=1)
        df.rename(columns={'_0': 'c_custkey'}, inplace=True)
        return df

    customer_project = map(lambda p: query_plan.add_operator(
        Project([], 'customer_project' + '_{}'.format(p), query_plan, False, customer_project_fn)),
                           range(0, customer_parts))

    # Bloom filter over customer keys, used to prune the orders scan.
    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False, fp_rate))

    customer_map = map(lambda p: query_plan.add_operator(
        Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p: query_plan.add_operator(
        SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                             "select "
                             " o_custkey, o_orderkey, o_orderdate, o_shippriority "
                             "from "
                             " S3Object "
                             "where "
                             " cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                             " {} "
                             " {} "
                             .format(
                                 ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                 get_sql_suffix('orders', order_parts, p, order_sharded, add_where=False)),
                             'o_custkey',
                             format_, use_pandas, secure, use_native,
                             'order_scan' + '_{}'.format(p),
                             query_plan,
                             False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)
        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)
        return df

    # FIX: this was range(0, customer_parts). order_project is connected
    # many-to-many to order_scan (one projection per scan part), so it must be
    # sized by order_parts or the pairing breaks when the part counts differ.
    order_project = map(lambda p: query_plan.add_operator(
        Project([], 'order_project' + '_{}'.format(p), query_plan, False, order_project_fn)),
                        range(0, order_parts))

    order_map_1 = map(lambda p: query_plan.add_operator(
        Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p: query_plan.add_operator(
        HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(p), query_plan, False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p: query_plan.add_operator(
        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                      'customer_order_join_probe' + '_' + str(p),
                      query_plan, False)),
                                    range(0, other_parts))

    # Second bloom filter, over order keys, used to prune the lineitem scan.
    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False, fp_rate))

    lineitem_scan = map(lambda p: query_plan.add_operator(
        SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                             "select "
                             " l_orderkey, l_extendedprice, l_discount "
                             "from "
                             " S3Object "
                             "where "
                             " cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                             " {} "
                             " {} "
                             .format(
                                 ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                 get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded, add_where=False)),
                             'l_orderkey',
                             format_, use_pandas, secure, use_native,
                             'lineitem_scan' + '_{}'.format(p),
                             query_plan,
                             False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)
        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'}, inplace=True)
        return df

    lineitem_project = map(lambda p: query_plan.add_operator(
        Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p: query_plan.add_operator(
        Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p: query_plan.add_operator(
        Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p: query_plan.add_operator(
        HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p), query_plan, False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p: query_plan.add_operator(
        HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                      'customer_order_lineitem_join_probe' + '_' + str(p),
                      query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        # revenue = l_extendedprice * (1 - l_discount), summed per group key.
        # NOTE(review): np.float is removed in NumPy >= 1.20.
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p: query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group' + '_{}'.format(p), query_plan, False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        # Re-group the partial 'revenue' sums into final totals.
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan, False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')],
            use_pandas, 'top', query_plan, False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)
    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)
    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)
    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)
    # connect_many_to_one(customer_order_join_probe, collate)
    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)
    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)
    # connect_many_to_one(customer_order_lineitem_join_probe, collate)
    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)
    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    # Header row plus the top 10 result rows.
    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, format_=Format.CSV):
    """Benchmark a server-side (S3 Select) range filter on l_extendedprice.

    Scans table_parts shards of lineitem with the predicate pushed into the
    scan SQL, renames the columns, counts the surviving rows and collates.

    :param lower: inclusive lower bound on l_extendedprice
    :param upper: inclusive upper bound on l_extendedprice
    :return: None
    """
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file; the range predicate is evaluated server-side.
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', True, p, sf=sf, format_=format_),
                "select * from S3Object "
                "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};"
                .format(lower, upper), format_, use_pandas, secure, use_native,
                'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Commented-out variant without the explicit cast, kept for reference.
    '''
    scan = map(lambda p: query_plan.add_operator(
        SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_),
                     "select * from S3Object "
                     "where l_extendedprice >= {} and l_extendedprice <= {};".format(
                         lower, upper),
                     format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)),
               range(0, table_parts))
    '''

    # project
    def fn(df):
        # Assign the TPC-H lineitem column names to the positional columns.
        df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice',
                      'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
                      'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']
        # NOTE(review): np.float is removed in NumPy >= 1.20; modern NumPy
        # needs float or np.float64 here.
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # aggregation: plain row count of the filtered output
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Wire each scan to its projection, fan all projections into the aggregate.
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def sql_scan_lineitem_operator_def(sharded, shard, sf, use_pandas, secure, use_native, name, query_plan, format_):
    """Create a SQLTableScan operator that reads every row of a lineitem shard.

    :param sharded: whether the lineitem table is sharded
    :param shard: shard index to read
    :param sf: scale factor used to resolve the object key
    :param use_pandas: pandas-output flag passed through to the scan
    :param secure: secure-connection flag passed through to the scan
    :param use_native: native-client flag passed through to the scan
    :param name: operator name
    :param query_plan: plan the operator belongs to
    :param format_: object format, echoed to stdout and passed to the scan
    :return: the configured SQLTableScan operator
    """
    print(format_)
    file_key = get_file_key('lineitem', sharded, shard, sf, format_)
    return SQLTableScan(file_key,
                        "select * from S3Object;",
                        format_, use_pandas, secure, use_native,
                        name, query_plan, False)
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf):
    """Benchmark a client-side range filter on lineitem: full table scan with
    the predicate applied in pandas inside the scan's fn, then a row count.

    :param lower: inclusive lower bound on l_extendedprice (column '_5')
    :param upper: inclusive upper bound on l_extendedprice (column '_5')
    :return: None
    """
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        # df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice',
        #               'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
        #               'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']
        # df[ ['l_extendedprice'] ] = df[ ['l_extendedprice'] ].astype(np.float)
        # criterion = (df['l_extendedprice'] >= lower) & (
        #     df['l_extendedprice'] <= upper)

        # '_5' is l_extendedprice in the positional layout (see the
        # commented-out column list above).
        # NOTE(review): np.float is removed in NumPy >= 1.20.
        df[['_5']] = df[['_5']].astype(np.float)
        criterion = (df['_5'] >= lower) & (df['_5'] <= upper)
        return df[criterion]

    # scan the file, filtering rows client-side via fn
    scan_filter = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', True, p, sf=sf),
                      use_pandas, secure, use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False,
                      fn=fn)),
        range(0, table_parts))

    # aggregation: count surviving rows
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Fan all filtered scans into the single aggregate.
    map(lambda (p, o): o.connect(aggregate), enumerate(scan_filter))
    aggregate.connect(collate)

    # scan_filter[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof"))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def query_plan(settings):
    # type: (SyntheticBloomJoinSettings) -> QueryPlan
    """Assemble a three-way bloom-join query plan.

    A is scanned first; a bloom filter on the A/B join key prunes the scan of
    B; the A-B join result feeds a second bloom filter that prunes the scan of
    C. The final join output is summed per part, reduced, projected and
    collated.

    :param settings: SyntheticBloomJoinSettings with table keys, part counts,
        sharding flags, join/detail field names and runtime options.
    :return: the constructed QueryPlan
    """

    query_plan = QueryPlan(is_async=settings.parallel, buffer_size=settings.buffer_size)

    def scan_A_fn(df):
        # Replace the positional S3 Select column names with the configured ones.
        df.columns = settings.table_A_field_names
        return df

    # Define the operators
    scan_A = \
        map(lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                         "select "
                         " {} "
                         "from "
                         " S3Object "
                         "where "
                         " {} "
                         " {} "
                         .format(','.join(settings.table_A_field_names),
                                 settings.table_A_filter_sql,
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded)),
                         settings.format_,
                         settings.use_pandas,
                         settings.secure,
                         settings.use_native,
                         'scan_A_{}'.format(p),
                         query_plan,
                         False,
                         fn=scan_A_fn)),
            range(0, settings.table_A_parts))

    # Commented-out projection variant; the renaming now happens in scan_A_fn.
    """
    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)],
            settings.table_A_field_names))

    def project_fn_A(df):
        df.rename(columns=field_names_map_A, inplace=True)
        return df

    project_A = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
        'project_A_{}'.format(p), query_plan, True, project_fn_A)),
        range(0, settings.table_A_parts))
    """

    # Bloom filter built from A's join-key values; sent to the B scans.
    bloom_create_a = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    def scan_B_fn(df):
        df.columns = settings.table_B_field_names
        return df

    # B scans consume the bloom filter on the A/B join key.
    scan_B = \
        map(lambda p: query_plan.add_operator(
            SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                                 "select "
                                 " {} "
                                 "from "
                                 " S3Object "
                                 "where "
                                 " {} "
                                 " {} "
                                 .format(','.join(settings.table_B_field_names),
                                         settings.table_B_filter_sql,
                                         get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                        settings.table_B_sharded, add_where=False)),
                                 settings.table_B_AB_join_key,
                                 settings.format_,
                                 settings.use_pandas,
                                 settings.secure,
                                 settings.use_native,
                                 'scan_B_{}'.format(p),
                                 query_plan,
                                 False,
                                 fn=scan_B_fn)),
            range(0, settings.table_B_parts))

    # Commented-out projection variant; the renaming now happens in scan_B_fn.
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
        'project_B_{}'.format(p), query_plan, True, project_fn_B)),
        range(0, settings.table_B_parts))
    """

    def scan_C_fn(df):
        df.columns = settings.table_C_field_names
        return df

    # C scans consume the bloom filter on the B/C join key.
    scan_C = \
        map(lambda p: query_plan.add_operator(
            SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                                 "select "
                                 " {} "
                                 "from "
                                 " S3Object "
                                 "where "
                                 " {} "
                                 " {} "
                                 .format(','.join(settings.table_C_field_names),
                                         settings.table_C_filter_sql,
                                         get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                        settings.table_C_sharded, add_where=False)),
                                 settings.table_C_BC_join_key,
                                 settings.format_,
                                 settings.use_pandas,
                                 settings.secure,
                                 settings.use_native,
                                 'scan_C_{}'.format(p),
                                 query_plan,
                                 False,
                                 fn=scan_C_fn)),
            range(0, settings.table_C_parts))

    # Commented-out projection variant; the renaming now happens in scan_C_fn.
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
            settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p: query_plan.add_operator(Project(
        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
        'project_C_{}'.format(p), query_plan, True, project_fn_C)),
        range(0, settings.table_C_parts))
    """

    # Shuffle operators: partition tuples on the relevant join keys.
    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                          'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    # Bloom filter over the A-B join output, used to prune the C scans.
    bloom_create_ab = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                          'join_probe_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def part_aggregate_fn(df):
        # Partial sum of the detail field of table C.
        sum_ = df[settings.table_C_detail_field_name].astype(float).sum()
        return pd.DataFrame({'_0': [sum_]})

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t: float(t[settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
        range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        # Combine the partial sums into a single total.
        # NOTE(review): np.float is removed in NumPy >= 1.20.
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(scan_A, project_A)
    #connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)
    #connect_many_to_many(project_A, bloom_create_a)
    connect_many_to_many(scan_A, bloom_create_a)
    connect_all_to_all(bloom_create_a, scan_B)
    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(scan_B, project_B)
    #connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)
    connect_many_to_many(join_probe_A_B, bloom_create_ab)
    connect_all_to_all(bloom_create_ab, scan_C)
    connect_many_to_many(join_build_AB_C, join_probe_AB_C)
    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)
    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(scan_C, project_C)
    #connect_many_to_many(project_C, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)
    connect_many_to_many(join_probe_AB_C, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas, buffer_size, table_parts, lower, upper, sharded,
                        sf, use_shared_mem, inline_ops, merge_ops):
    """Build a scan -> filter -> aggregate -> collate plan with the filter kept
    as a separate operator, with per-stage profiling enabled.

    :param shared_mem_buffer_size: shared-memory buffer size (used only when
        use_shared_mem is True)
    :param lower: inclusive lower bound on column '_5'
    :param upper: inclusive upper bound on column '_5'
    :param inline_ops: when True the filter operators are made synchronous
        (set_async(False)) so they run inline in their producer
    :param merge_ops: forwarded to get_profile_file_suffix to label the
        profiling output files
    :return: the constructed QueryPlan
    """
    secure = False
    use_native = False

    if use_shared_mem:
        system = WorkerSystem(shared_mem_buffer_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=parallel, buffer_size=buffer_size, use_shared_mem=use_shared_mem)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', sharded, p, sf=sf),
                      use_pandas, secure, use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False)),
        range(0, table_parts))

    def filter_fn(df):
        # Boolean mask: lower <= '_5' <= upper.
        # NOTE(review): np.float is removed in NumPy >= 1.20.
        return (df['_5'].astype(np.float) >= lower) & (df['_5'].astype(np.float) <= upper)

    filter_ = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(None, filter_fn), 'filter_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # aggregation: count the rows that survive the filter
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Run the filters inline (synchronously) when inline_ops is set.
    map(lambda p: p.set_async(not inline_ops), filter_)

    connect_many_to_many(scan, filter_)
    connect_many_to_one(filter_, aggregate)
    connect_one_to_one(aggregate, collate)

    profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops, use_shared_mem)

    # Profile one operator of each stage (plus the singletons).
    scan[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
                                            gen_test_id() + "_scan_0" + profile_file_suffix + ".prof"))
    filter_[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
                                               gen_test_id() + "_filter_0" + profile_file_suffix + ".prof"))
    aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
                                              gen_test_id() + "_aggregate" + profile_file_suffix + ".prof"))
    collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
                                            gen_test_id() + "_collate" + profile_file_suffix + ".prof"))

    return query_plan