def test_join_topk(): """Tests a top k with a join :return: None """ limit = 5 query_plan = QueryPlan() # Query plan ts1 = query_plan.add_operator(SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts1', query_plan, False)) ts1_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'ts1_project', query_plan, False)) ts2 = query_plan.add_operator(SQLTableScan('nation.csv', 'select * from S3Object;', False, 'ts2', query_plan, False)) ts2_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'ts2_project', query_plan, False)) j = query_plan.add_operator(HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'j', query_plan, False)) t = query_plan.add_operator(Limit(limit, 't', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts1.connect(ts1_project) ts2.connect(ts2_project) j.connect_left_producer(ts1_project) j.connect_right_producer(ts2_project) j.connect(t) t.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) c.print_tuples() field_names = ['s_nationkey', 'n_nationkey'] assert len(c.tuples()) == limit + 1 assert c.tuples()[0] == field_names num_rows = 0 for t in c.tuples(): num_rows += 1 # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['s_nationkey'] == lt['n_nationkey'] # Write the metrics query_plan.print_metrics()
def project_partkey_avg_quantity_op(name, query_plan):
    """with lineitem_part_avg_group_project as (
    select l_partkey, 0.2 * avg(l_quantity) as l_quantity_computed00
    from lineitem_part_avg_group
    )

    :return:
    """

    def fn(df):
        # return df[['_0', '_1', '_2']]
        df['avg_l_quantity_computed00'] = 0.2 * (
            df['sum_l_quantity_computed00'].astype(np.float) /
            df['cnt_l_quantity_computed00'].astype(np.float))

        df = df.filter(items=['l_partkey', 'avg_l_quantity_computed00'], axis=1)

        # df.rename(columns={'l_partkey': 'l_partkey', 'l_quantity': 'avg_l_quantity_computed00'},
        #           inplace=True)

        return df

    return Project(
        [
            # l_partkey
            ProjectExpression(lambda t_: t_['_0'], 'l_partkey'),
            # 0.2 * avg
            ProjectExpression(lambda t_: 0.2 * t_['_1'], 'avg_l_quantity_computed00')
        ],
        name, query_plan, False, fn)
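# --- Illustration (not part of the framework) ---------------------------------
# A minimal, standalone pandas check of the derived-average computation used in
# the pandas path above: 0.2 * (sum / count) per part key. The values below are
# made up; only the column names mirror what the operator expects.
import pandas as pd

toy = pd.DataFrame({
    'l_partkey': ['1', '2'],
    'sum_l_quantity_computed00': ['100.0', '30.0'],
    'cnt_l_quantity_computed00': ['4', '3'],
})
toy['avg_l_quantity_computed00'] = 0.2 * (
    toy['sum_l_quantity_computed00'].astype(float) /
    toy['cnt_l_quantity_computed00'].astype(float))
print(toy[['l_partkey', 'avg_l_quantity_computed00']])
# Expected: part 1 -> 0.2 * 25.0 = 5.0, part 2 -> 0.2 * 10.0 = 2.0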
def project_lineitem_orderkey_partkey_quantity_extendedprice_op(
        name, query_plan):
    """with lineitem_project as (
    select _0 as l_orderkey, _1 as l_partkey, _4 as l_quantity, _5 as l_extendedprice
    from lineitem_scan
    )

    :param query_plan:
    :param name:
    :return:
    """

    def fn(df):
        # return df[['_0', '_1', '_2']]
        df = df.filter(items=['_0', '_1', '_4', '_5'], axis=1)

        df.rename(columns={
            '_0': 'l_orderkey',
            '_1': 'l_partkey',
            '_4': 'l_quantity',
            '_5': 'l_extendedprice'
        }, inplace=True)

        return df

    return Project([
        ProjectExpression(lambda t_: t_['_0'], 'l_orderkey'),
        ProjectExpression(lambda t_: t_['_1'], 'l_partkey'),
        ProjectExpression(lambda t_: t_['_4'], 'l_quantity'),
        ProjectExpression(lambda t_: t_['_5'], 'l_extendedprice')
    ], name, query_plan, False, fn)
def project_partkey_brand_container_op(name, query_plan):
    """with part_project as (
    select _0 as p_partkey, _3 as p_brand, _6 as p_container from part_scan
    )

    :param query_plan:
    :param name:
    :return:
    """

    def fn(df):
        # return df[['_0', '_1', '_2']]
        df = df.filter(items=['_0', '_3', '_6'], axis=1)

        df.rename(columns={
            '_0': 'p_partkey',
            '_3': 'p_brand',
            '_6': 'p_container'
        }, inplace=True)

        return df

    return Project([
        ProjectExpression(lambda t_: t_['_0'], 'p_partkey'),
        ProjectExpression(lambda t_: t_['_3'], 'p_brand'),
        ProjectExpression(lambda t_: t_['_6'], 'p_container')
    ], name, query_plan, False, fn)
def test_pandas_project_simple(): """Tests a projection :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object ' 'limit 3;', True, 'ts', query_plan, False)) p = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'), ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_3'], 'n_comment') ], 'p', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['n_regionkey', 'n_nationkey', 'n_comment'] assert len(c.tuples()) == 3 + 1 assert c.tuples()[0] == field_names assert c.tuples()[1] == [ '0', '0', ' haggle. carefully final deposits detect slyly agai' ] assert c.tuples()[2] == [ '1', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert c.tuples()[3] == [ '1', '2', 'y alongside of the pending deposits. carefully special packages ' 'are about the ironic forges. slyly special ' ] # Write the metrics query_plan.print_metrics()
def test_project_perf(): """Executes a projection over many source rows to examine performance. :return: None """ num_rows = 10000 profile_file_name = os.path.join( ROOT_DIR, "../tests-output/" + gen_test_id() + ".prof") query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan random_col_defs = [ RandomIntColumnDef(0, 9), RandomStringColumnDef(10, 20), RandomDateColumnDef(datetime.strptime('2017-01-01', '%Y-%m-%d'), datetime.strptime('2018-01-01', '%Y-%m-%d')) ] random_table_scan = query_plan.add_operator( RandomTableScan(num_rows, random_col_defs, 'random_table_scan', query_plan, False)) project = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_0'], 'r_0'), ProjectExpression(lambda t_: t_['_1'], 'r_1'), ProjectExpression(lambda t_: t_['_2'], 'r_2') ], 'project', query_plan, False)) project.set_profiled(True, profile_file_name) collate = query_plan.add_operator(Collate('collate', query_plan, False)) random_table_scan.connect(project) project.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() query_plan.stop() # Write the metrics s = pstats.Stats(profile_file_name) s.strip_dirs().sort_stats("time").print_stats() assert len(tuples) == num_rows + 1
def test_join_empty():
    """Executes a join where no records are returned. We test this as it's somewhat
    peculiar with S3 Select, inasmuch as S3 does not return column names when
    selecting data, meaning, unlike a traditional DBMS, no field names tuple should
    be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    supplier_scan = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False,
                     'supplier_scan', query_plan, False))

    supplier_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')],
                'supplier_project', query_plan, False))

    nation_scan = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object limit 0;', False,
                     'nation_scan', query_plan, False))

    nation_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')],
                'nation_project', query_plan, False))

    supplier_nation_join = query_plan.add_operator(
        HashJoin(JoinExpression('s_nationkey', 'n_nationkey'),
                 'supplier_nation_join', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    supplier_scan.connect(supplier_project)
    nation_scan.connect(nation_project)
    supplier_nation_join.connect_left_producer(supplier_project)
    supplier_nation_join.connect_right_producer(nation_project)
    supplier_nation_join.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in collate.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(collate.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def project_partkey_type_operator_def(name, query_plan): # type: (str, QueryPlan) -> Project def fn(df): # return df[['_0', '_1', '_2']] df = df.filter(items=['_0', '_1'], axis=1) df.rename(columns={'_0': 'p_partkey', '_1': 'p_type'}, inplace=True) return df return Project([ ProjectExpression(lambda t_: t_['_0'], 'p_partkey'), ProjectExpression(lambda t_: t_['_1'], 'p_type') ], name, query_plan, False, fn)
def test_project_empty():
    """Executes a projection query with no results returned. We test this as it's
    somewhat peculiar with S3 Select, inasmuch as S3 does not return column names
    when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from S3Object "
                     "limit 0;", False, 'ts', query_plan, False))

    p = query_plan.add_operator(
        Project([
            ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'),
            ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
            ProjectExpression(lambda t_: t_['_3'], 'n_comment')
        ], 'p', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(p)
    p.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
def project_partkey_brand_size_container_filtered_op(name, query_plan): def fn(df): # return df[['_0', '_1', '_2']] df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1) df.rename(columns={'_0': 'p_partkey', '_1': 'p_brand', '_2': 'p_size', '_3': 'p_container'}, inplace=True) return df return Project( [ ProjectExpression(lambda t_: t_['_0'], 'p_partkey'), ProjectExpression(lambda t_: t_['_1'], 'p_brand'), ProjectExpression(lambda t_: t_['_2'], 'p_size'), ProjectExpression(lambda t_: t_['_3'], 'p_container') ], name, query_plan, False, fn)
def aggregate_project_def(name, query_plan): def fn(df): # return df[['_0', '_1', '_2']] df = df.filter(items=['_0'], axis=1) df.rename(columns={'_0': 'revenue'}, inplace=True) return df return Project([ProjectExpression(lambda t_: t_['_0'], 'revenue')], name, query_plan, False, fn)
def project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op( name, query_plan): def fn(df): # return df[['_0', '_1', '_2']] df = df.filter(items=['_0', '_1', '_2', '_3', '_4', '_5'], axis=1) df.rename(columns={ '_0': 'l_partkey', '_1': 'l_quantity', '_2': 'l_extendedprice', '_3': 'l_discount', '_4': 'l_shipinstruct', '_5': 'l_shipmode' }, inplace=True) return df return Project([ ProjectExpression(lambda t_: t_['_0'], 'l_partkey'), ProjectExpression(lambda t_: t_['_1'], 'l_quantity'), ProjectExpression(lambda t_: t_['_2'], 'l_extendedprice'), ProjectExpression(lambda t_: t_['_3'], 'l_discount'), ProjectExpression(lambda t_: t_['_4'], 'l_shipinstruct'), ProjectExpression(lambda t_: t_['_5'], 'l_shipmode') ], name, query_plan, False, fn)
def project_orderkey_partkey_quantity_extendedprice_op(name, query_plan): def fn(df): # return df[['_0', '_1', '_2']] df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1) df.rename(columns={ '_0': 'l_orderkey', '_1': 'l_partkey', '_2': 'l_quantity', '_3': 'l_extendedprice' }, inplace=True) return df return Project([ ProjectExpression(lambda t_: t_['_0'], 'l_orderkey'), ProjectExpression(lambda t_: t_['_1'], 'l_partkey'), ProjectExpression(lambda t_: t_['_2'], 'l_quantity'), ProjectExpression(lambda t_: t_['_3'], 'l_extendedprice') ], name, query_plan, False, fn)
def project_partkey_extendedprice_discount_shipdate_operator_def( name, query_plan): def fn(df): # return df[['_0', '_1', '_2']] df = df.filter(items=['_1', '_5', '_6', '_10'], axis=1) df.rename(columns={ '_1': 'l_partkey', '_5': 'l_extendedprice', '_6': 'l_discount', '_10': 'l_shipdate' }, inplace=True) return df return Project([ ProjectExpression(lambda t_: t_['_1'], 'l_partkey'), ProjectExpression(lambda t_: t_['_5'], 'l_extendedprice'), ProjectExpression(lambda t_: t_['_6'], 'l_discount'), ProjectExpression(lambda t_: t_['_10'], 'l_shipdate') ], name, query_plan, False, fn)
def project_lineitem_from_filtered_scan_operator_def(name, query_plan): def fn(df): df.rename(columns={'_0': 'l_quantity', '_1': 'l_extendedprice', '_2': 'l_discount', '_3': 'l_tax', '_4': 'l_returnflag', '_5': 'l_linestatus', '_6': 'l_shipdate'}, inplace=True) return df return Project( [ ProjectExpression(lambda t_: t_['_0'], 'l_quantity'), ProjectExpression(lambda t_: t_['_1'], 'l_extendedprice'), ProjectExpression(lambda t_: t_['_2'], 'l_discount'), ProjectExpression(lambda t_: t_['_3'], 'l_tax'), ProjectExpression(lambda t_: t_['_4'], 'l_returnflag'), ProjectExpression(lambda t_: t_['_5'], 'l_linestatus'), ProjectExpression(lambda t_: t_['_6'], 'l_shipdate'), ], name, query_plan, False, fn)
def project_avg_yearly_op(name, query_plan): """with extendedprice_sum_aggregate_project as ( select l_extendedprice / 7.0 as avg_yearly from extendedprice_sum_aggregate ) :return: """ def fn(df): # return df[['_0', '_1', '_2']] df['_0'] = df['_0'].astype(np.float) / 7.0 df = df.filter(items=['_0'], axis=1) df.rename(columns={'_0': 'avg_yearly'}, inplace=True) return df return Project( [ProjectExpression(lambda t_: t_['_0'] / 7.0, 'avg_yearly')], name, query_plan, False, fn)
def project_lineitem_operator_def(name, query_plan): # type: (str, QueryPlan) -> Project def fn(df): df = df.filter(items=['_4', '_5', '_6', '_7', '_8', '_9', '_10'], axis=1) df.rename(columns={'_4': 'l_quantity', '_5': 'l_extendedprice', '_6': 'l_discount', '_7': 'l_tax', '_8': 'l_returnflag', '_9': 'l_linestatus', '_10': 'l_shipdate'}, inplace=True) return df return Project( [ ProjectExpression(lambda t_: t_['_4'], 'l_quantity'), ProjectExpression(lambda t_: t_['_5'], 'l_extendedprice'), ProjectExpression(lambda t_: t_['_6'], 'l_discount'), ProjectExpression(lambda t_: t_['_7'], 'l_tax'), ProjectExpression(lambda t_: t_['_8'], 'l_returnflag'), ProjectExpression(lambda t_: t_['_9'], 'l_linestatus'), ProjectExpression(lambda t_: t_['_10'], 'l_shipdate'), ], name, query_plan, False, fn)
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, format_, shards_path): secure = False use_native = False print('') print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format( sort_field, sort_order, k)) print("----------------------") stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p ), "select * from S3Object;", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK sort_expr = [SortExpression(sort_field, float, sort_order)] topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() query_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ 0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
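# --- Illustration (not part of the framework) ---------------------------------
# Standalone sketch of the baseline pattern wired above: every partition keeps
# its own top k, and a single reducer takes the top k of the concatenated
# partials. Plain pandas on made-up data; the column name 'val' and the helper
# name are assumptions for illustration only.
import pandas as pd

def partitioned_topk(partitions, k, descending=True):
    partials = [(p.nlargest(k, 'val') if descending else p.nsmallest(k, 'val'))
                for p in partitions]
    merged = pd.concat(partials)
    return merged.nlargest(k, 'val') if descending else merged.nsmallest(k, 'val')

parts = [pd.DataFrame({'val': [5.0, 1.0, 9.0]}),
         pd.DataFrame({'val': [7.0, 2.0]})]
print(partitioned_topk(parts, k=2))  # 9.0 and 7.0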
def run_head_table_sampling(stats, sort_field_index, sort_field, k, sample_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): secure = False use_native = False print('') print( "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}" .format(sort_field, sort_order, k, sample_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, 1 ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) sample_scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}.{}".format(shards_path, p), 'select {} from S3Object limit {};'.format( sort_field, per_part_samples), format_, use_pandas, secure, use_native, 'sample_scan_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Sampling project def project_fn(df): df.columns = [sort_field] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda op: op.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), 
enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def run_local_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format( sort_field, sort_order)) print("----------------------") stats += [ 'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scan = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda (i, op): sample_scan[i].set_nthreads(n_threads), enumerate(sample_scan)) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 
'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda o: tbl_smpler.connect(o), sample_scan) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts_end - table_parts_start)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
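# --- Illustration (not part of the framework) ---------------------------------
# Standalone sketch of the threshold idea behind the sampling plans above: the
# k-th best value seen in the sample bounds the final answer, so the second scan
# only needs rows that are at least as good. The helper name, the field name and
# the exact SQL emitted by TopKFilterBuild are assumptions for illustration.
def build_topk_predicate(sample_values, k, sort_order='DESC', field='_5'):
    ordered = sorted(sample_values, reverse=(sort_order == 'DESC'))
    threshold = ordered[min(k, len(ordered)) - 1]
    op = '>=' if sort_order == 'DESC' else '<='
    sql = "select * from S3Object where CAST({} as float) {} {};".format(field, op, threshold)
    return threshold, sql

threshold, sql = build_topk_predicate([3.2, 9.9, 0.4, 7.5, 5.1], k=3)
print(threshold)  # 5.1 - rows below this cannot enter the global top 3
print(sql)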
def run_memory_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print( "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}" .format(sort_field, sort_order, k, sample_size, batch_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scanners = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda op: op.set_nthreads(n_threads), sample_scanners) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sample_sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( 
lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda op: tbl_smpler.connect(op), sample_scanners) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scanners)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down # query_plan.stop() sampling_threshold = query_plan.retrieve_sampling_threshold() sampling_runtime = query_plan.get_phase_runtime('sampl') sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost( 'sampl') sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost( phase_keyword='sampl') sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost( 'sampl') total_runtime = query_plan.total_elapsed_time total_http_requests, total_requests_cost = query_plan.requests_cost() total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost( ) total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost() total_data_cost = query_plan.data_cost()[0] total_computation_cost = query_plan.computation_cost() total_cost = query_plan.cost()[0] stats += [ sampling_threshold, sampling_runtime, total_runtime - sampling_runtime, total_runtime, sampling_returned_rows, sampling_scanned_bytes * BYTE_TO_MB, sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests, sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost, total_returned_rows, total_scanned_bytes * BYTE_TO_MB, total_returned_bytes * BYTE_TO_MB, total_http_requests, total_requests_cost, total_transfer_cost, total_scan_cost, total_data_cost, total_computation_cost, total_cost ]
def query_plan(settings): # type: (SyntheticFilteredJoinSettings) -> QueryPlan """ :return: None """ if settings.use_shared_mem: system = WorkerSystem(settings.shared_memory_size) else: system = None query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size, use_shared_mem=settings.use_shared_mem) # Define the operators scan_A = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf), "select " " {} " "from " " S3Object " " {} " " {} " .format(','.join(settings.table_A_field_names), ' where {} '.format( settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '', get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded, add_where=settings.table_A_filter_sql is None)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_A_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) field_names_map_A = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_A_field_names) ], settings.table_A_field_names)) def project_fn_A(df): df = df.rename(columns=field_names_map_A, copy=False) return df project_A = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_A.iteritems() ], 'project_A_{}'.format(p), query_plan, False, project_fn_A)), range(0, settings.table_A_parts)) scan_B = \ map(lambda p: query_plan.add_operator( SQLTableScan( get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {} " "from " " S3Object " " {} " " {} " .format(','.join(settings.table_B_field_names), ' where {} '.format( settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '', get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=settings.table_B_filter_sql is None)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_B = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_B_field_names) ], settings.table_B_field_names)) def project_fn_B(df): df.rename(columns=field_names_map_B, inplace=True) return df project_B = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_B.iteritems() ], 'project_B_{}'.format(p), query_plan, False, project_fn_B)), range(0, settings.table_B_parts)) if settings.table_C_key is not None: scan_C = \ map(lambda p: query_plan.add_operator( SQLTableScan( get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_C_field_names), settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_C = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_C_field_names) ], settings.table_C_field_names)) def project_fn_C(df): df = df.rename(columns=field_names_map_C, copy=False) return df project_C = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_C.iteritems() ], 'project_C_{}'.format(p), query_plan, False, project_fn_C)), range(0, 
settings.table_C_parts)) map_B_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_C_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) join_build_AB_C = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_AB_C = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_AB_C_{}'. format(p), query_plan, False)), range(0, settings.table_C_parts)) map_A_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) map_B_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_A_B = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'. format(p), query_plan, False)), range(0, settings.other_parts)) join_probe_A_B = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. table_B_AB_join_key), 'join_probe_A_B_{}'. format(p), query_plan, False)), range(0, settings.other_parts)) if settings.table_C_key is None: def part_aggregate_fn(df): sum_ = df[settings.table_B_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_B_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.other_parts)) else: def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Inline some of the operators map(lambda o: o.set_async(False), project_A) map(lambda o: o.set_async(False), project_B) map(lambda o: o.set_async(False), map_A_to_B) map(lambda o: o.set_async(False), map_B_to_B) if settings.table_C_key is not None: map(lambda o: o.set_async(False), map_B_to_C) map(lambda o: o.set_async(False), map_C_to_C) map(lambda o: o.set_async(False), project_C) map(lambda o: o.set_async(False), part_aggregate) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(scan_A, project_A) 
connect_many_to_many(project_A, map_A_to_B) connect_all_to_all(map_A_to_B, join_build_A_B) connect_many_to_many(join_build_A_B, join_probe_A_B) connect_many_to_many(scan_B, project_B) connect_many_to_many(project_B, map_B_to_B) connect_all_to_all(map_B_to_B, join_probe_A_B) if settings.table_C_key is None: connect_many_to_many(join_probe_A_B, part_aggregate) else: connect_many_to_many(join_probe_A_B, map_B_to_C) connect_all_to_all(map_B_to_C, join_build_AB_C) connect_many_to_many(join_build_AB_C, join_probe_AB_C) connect_many_to_many(scan_C, project_C) connect_many_to_many(project_C, map_C_to_C) connect_all_to_all(map_C_to_C, join_probe_AB_C) connect_many_to_many(join_probe_AB_C, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
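# --- Illustration (not part of the framework) ---------------------------------
# Standalone sketch of the partitioned hash join the plan above builds out of
# Map, HashJoinBuild and HashJoinProbe: rows are routed to a partition by
# hashing the join key, the build side fills a dict per partition, and probe
# rows look up matches locally. Keys, field names and rows are made up.
def hash_partition(rows, key, n_parts):
    parts = [[] for _ in range(n_parts)]
    for row in rows:
        parts[hash(row[key]) % n_parts].append(row)
    return parts

def hash_join(build_rows, probe_rows, build_key, probe_key):
    table = {}
    for row in build_rows:  # build phase
        table.setdefault(row[build_key], []).append(row)
    joined = []
    for row in probe_rows:  # probe phase
        for match in table.get(row[probe_key], []):
            merged = dict(match)
            merged.update(row)
            joined.append(merged)
    return joined

a_parts = hash_partition([{'a_key': 1, 'a_val': 'x'}], 'a_key', 2)
b_parts = hash_partition([{'b_key': 1, 'b_val': 'y'}], 'b_key', 2)
for a_part, b_part in zip(a_parts, b_parts):
    print(hash_join(a_part, b_part, 'a_key', 'b_key'))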
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) ########################## ## Phase 1. Find out group names ########################## # Scan scan_phase1 = map( lambda p: query_plan.add_operator( SQLTableScan( files.format(p), "select {} from S3Object;".format(','.join( group_fields)), format_, use_pandas, secure, use_native, 'scan_phase1_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = group_fields return df project_exprs = [ ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields) ] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # Groupby def groupby_fn(df): return df.drop_duplicates() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) # GroupbyFilterBuild agg_exprs = [('SUM', 'CAST({} AS float)'.format(agg_field)) for agg_field in agg_fields] groupby_filter_build = query_plan.add_operator( GroupbyFilterBuild(group_fields, agg_fields, agg_exprs, 'groupby_filter_build', query_plan, False)) ########################## ## Phase 2. Perform aggregation at S3. ########################## # Scan scan_phase2 = map( lambda p: query_plan.add_operator( SQLTableScan( 'groupby_benchmark/shards-10GB/groupby_data_{}.csv'.format( p), "", format_, use_pandas, secure, use_native, 'scan_phase2_{}'.format(p), query_plan, False)), range(0, table_parts)) groupby_decoder = map( lambda p: query_plan.add_operator( GroupbyDecoder(agg_fields, 'groupby_decoder_{}'.format(p), query_plan, False)), range(0, table_parts)) def groupby_fn_phase2(df): #print df df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby_reduce_phase2 = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce_phase2', query_plan, False, groupby_fn_phase2)) #scan_phase1[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase1_0" + ".prof")) #scan_phase2[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase2_0" + ".prof")) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # phase 1 map(lambda (p, o): o.connect(project[p]), enumerate(scan_phase1)) map(lambda (p, o): o.connect(groupby[p]), enumerate(project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(groupby_filter_build) # phase 2 map(lambda (p, o): groupby_filter_build.connect(o, 0), enumerate(scan_phase2)) map(lambda (p, o): groupby_filter_build.connect(o, 1), enumerate(groupby_decoder)) map(lambda (p, o): o.connect(groupby_decoder[p]), enumerate(scan_phase2)) map(lambda (p, o): o.connect(groupby_reduce_phase2), enumerate(groupby_decoder)) # map(lambda (p, o): groupby_reduce.connect(o), enumerate(groupby_decoder)) groupby_reduce_phase2.connect(collate) # Plan 
settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
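# --- Illustration (not part of the framework) ---------------------------------
# Standalone sketch of what the two-phase plan above is aiming for: phase 1
# discovers the distinct group values, and phase 2 pushes one aggregate per
# discovered group down to the scan. GroupbyFilterBuild's actual SQL is not
# shown in this file, so the string produced here is purely illustrative, as
# are the helper, field and group names.
def build_phase2_sql(groups, group_field, agg_field):
    cases = ["sum(case when {g} = '{v}' then CAST({a} as float) else 0 end)"
             .format(g=group_field, v=value, a=agg_field) for value in groups]
    return "select {} from S3Object;".format(', '.join(cases))

print(build_phase2_sql(['A', 'B'], 'g_0', 'a_0'))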
def test_operators(): system = WorkerSystem() query_plan = QueryPlan(system, is_async=True, buffer_size=0) # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object ' 'limit 3;', True, False, False, 'scan', query_plan, True)) p = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_1'], 'n_name'), ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'), ProjectExpression(lambda t_: t_['_3'], 'n_comment') ], 'project', query_plan, True)) c = query_plan.add_operator(Collate('collate', query_plan, True)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = c.tuples() c.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment'] assert len(tuples) == 3 + 1 assert tuples[0] == field_names assert tuples[1] == [ '0', 'ALGERIA', '0', ' haggle. carefully final deposits detect slyly agai' ] assert tuples[2] == [ '1', 'ARGENTINA', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert tuples[3] == [ '2', 'BRAZIL', '1', 'y alongside of the pending deposits. carefully special packages ' 'are about the ironic forges. slyly special ' ]
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan( files.format(p), "select {} from S3Object;".format(','.join( group_fields + agg_fields)), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = group_fields + agg_fields return df project_exprs = [ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields)] \ + [ProjectExpression(lambda t_: t_['_{}'.format(n + len(group_fields))], v) for n, v in enumerate(agg_fields)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # Groupby def groupby_fn(df): df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(groupby[p]), enumerate(project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
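# --- Illustration (not part of the framework) ---------------------------------
# Standalone note on why the plan above can reuse the same groupby_fn for the
# reducer: a sum of per-partition sums equals the global sum, so grouping the
# partial results again gives the right answer. Plain pandas on made-up data.
import pandas as pd

part1 = pd.DataFrame({'g': ['A', 'B', 'A'], 'v': [1.0, 2.0, 3.0]})
part2 = pd.DataFrame({'g': ['A'], 'v': [4.0]})
partials = [p.groupby('g', as_index=False)['v'].sum() for p in (part1, part2)]
reduced = pd.concat(partials).groupby('g', as_index=False)['v'].sum()
print(reduced)  # A -> 8.0, B -> 2.0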
def project_p_partkey_operator_def(name, query_plan): return Project([ProjectExpression(lambda t_: t_['_0'], 'p_partkey')], name, query_plan, False)
def query_plan(settings): # type: (SyntheticSemiJoinSettings) -> QueryPlan """ :return: None """ if settings.use_shared_mem: system = WorkerSystem(settings.shared_memory_size) else: system = None query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size, use_shared_mem=settings.use_shared_mem) # Define the operators scan_a = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_A_AB_join_key, settings.table_A_filter_sql, get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_a' + '_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) field_names_map_a = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_A_field_names) ], settings.table_A_field_names)) def project_fn_a(df): df = df.rename(columns=field_names_map_a, copy=False) return df project_a = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_a.iteritems() ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a) ), range(0, settings.table_A_parts)) bloom_create_ab_join_key = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_A_AB_join_key, 'bloom_create_ab_join_key' + '_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_A_parts)) scan_b_on_ab_join_key = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_B_BC_join_key, settings.table_B_AB_join_key, settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.format_, settings.table_B_AB_join_key, settings.use_pandas, settings.secure, settings.use_native, 'scan_b_on_ab_join_key' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) if settings.table_C_key is None: scan_b_detail_on_b_pk = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_B_primary_key, settings.table_B_detail_field_name, settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.format_, settings.table_B_primary_key, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_detail_on_b_pk' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_b_detail = OrderedDict([ ('_0', settings.table_B_primary_key), ('_1', settings.table_B_detail_field_name) ]) def project_fn_b_detail(df): df.rename(columns=field_names_map_b_detail, inplace=True) return df project_b_detail = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_b_detail.iteritems() ], 'project_b_detail' + '_{}'.format(p), query_plan, False, project_fn_b_detail)), range(0, settings.table_B_parts)) map_b_pk_1 = map( lambda p: query_plan.add_operator( Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) map_b_pk_2 
= map( lambda p: query_plan.add_operator( Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) bloom_create_b_pk = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_primary_key, 'bloom_create_b_pk' + '_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_B_parts)) join_probe_ab_and_b_on_b_pk = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_primary_key, settings. table_B_primary_key), 'join_probe_ab_and_b_on_b_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) join_build_ab_and_b_on_b_pk = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_primary_key, 'join_build_ab_and_b_on_b_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) else: scan_c_on_bc_join_key = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {}, {} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_C_primary_key, settings.table_C_BC_join_key, settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_BC_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_on_bc_join_key' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_c = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_C_field_names) ], settings.table_C_field_names)) def project_fn_c(df): df.rename(columns=field_names_map_c, inplace=True) return df project_c = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_c.iteritems() ], 'project_c' + '_{}'.format(p), query_plan, False, project_fn_c)), range(0, settings.table_C_parts)) scan_c_detail_on_c_pk = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_C_primary_key, settings.table_C_detail_field_name, settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_primary_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_detail_on_c_pk' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_c_detail = OrderedDict([ ('_0', settings.table_C_primary_key), ('_1', settings.table_C_detail_field_name) ]) def project_fn_c_detail(df): df.rename(columns=field_names_map_c_detail, inplace=True) return df project_c_detail = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_c_detail.iteritems() ], 'project_c_detail' + '_{}'.format(p), query_plan, False, project_fn_c_detail)), range(0, settings.table_C_parts)) map_bc_b_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'. 
format(p), query_plan, False)), range(0, settings.table_C_parts)) map_c_pk_1 = map( lambda p: query_plan.add_operator( Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) map_c_pk_2 = map( lambda p: query_plan.add_operator( Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) bloom_create_c_pk = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_C_primary_key, 'bloom_create_bc_b_to_c_join_key_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_C_parts)) join_build_ab_and_c_on_bc_join_key = map( lambda p: query_plan.add_operator( HashJoinBuild( settings.table_B_BC_join_key, 'join_build_ab_and_c_on_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_ab_and_c_on_bc_join_key = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_build_abc_and_c_on_c_pk = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_C_primary_key, 'join_build_abc_and_c_on_c_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_abc_and_c_on_c_pk = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_C_primary_key, settings. table_C_primary_key), 'join_probe_abc_and_c_on_c_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) bloom_create_bc_join_key = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_BC_join_key, 'bloom_create_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) map_bc_c_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_b = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_B_field_names) ], settings.table_B_field_names)) def project_fn_b(df): df.rename(columns=field_names_map_b, inplace=True) return df project_b = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_b.iteritems() ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b) ), range(0, settings.table_B_parts)) map_ab_a_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}' .format(p), query_plan, False)), range(0, settings.table_A_parts)) map_ab_b_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_a_and_b_on_ab_join_key = map( lambda p: query_plan.add_operator( HashJoinBuild( settings.table_A_AB_join_key, 'join_build_a_and_b_on_ab_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_a_and_b_on_ab_join_key = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. 
table_B_AB_join_key), 'join_probe_a_and_b_on_ab_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) if settings.table_C_key is None: def part_aggregate_fn(df): sum_ = df[settings.table_B_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_B_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_B_parts)) else: def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Inline some of the operators map(lambda o: o.set_async(False), project_a) map(lambda o: o.set_async(False), project_b) map(lambda o: o.set_async(False), map_ab_a_join_key) map(lambda o: o.set_async(False), map_ab_b_join_key) if settings.table_C_key is None: map(lambda o: o.set_async(False), map_b_pk_1) map(lambda o: o.set_async(False), map_b_pk_2) map(lambda o: o.set_async(False), project_b_detail) else: map(lambda o: o.set_async(False), map_bc_b_join_key) map(lambda o: o.set_async(False), map_bc_c_join_key) map(lambda o: o.set_async(False), map_c_pk_1) map(lambda o: o.set_async(False), map_c_pk_2) map(lambda o: o.set_async(False), project_c) map(lambda o: o.set_async(False), project_c_detail) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(scan_a, project_a) connect_many_to_many(project_a, map_ab_a_join_key) connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key) connect_all_to_all(project_a, bloom_create_ab_join_key) # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key) connect_many_to_many(join_build_a_and_b_on_ab_join_key, join_probe_a_and_b_on_ab_join_key) # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key) connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key) connect_many_to_many(scan_b_on_ab_join_key, project_b) # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key) # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key) connect_many_to_many(project_b, map_ab_b_join_key) connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B) if settings.table_C_key is None: # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate) connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail) connect_many_to_many(project_b_detail, map_b_pk_2) connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk) 
connect_all_to_all(join_probe_a_and_b_on_ab_join_key, bloom_create_b_pk) connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk) connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate) connect_many_to_many(join_build_ab_and_b_on_b_pk, join_probe_ab_and_b_on_b_pk) connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1) connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk) else: connect_all_to_all(join_probe_a_and_b_on_ab_join_key, bloom_create_bc_join_key) connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key) connect_many_to_many(scan_c_on_bc_join_key, project_c) # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key) connect_all_to_all(map_bc_c_join_key, join_probe_ab_and_c_on_bc_join_key) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key) connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bc_b_join_key) connect_all_to_all(map_bc_b_join_key, join_build_ab_and_c_on_bc_join_key) connect_all_to_all(join_probe_ab_and_c_on_bc_join_key, bloom_create_c_pk) # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk) connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1) connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk) connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk) # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk) connect_many_to_many(join_build_abc_and_c_on_c_pk, join_probe_abc_and_c_on_c_pk) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C) # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk) connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail) # connect_many_to_many(project_c_detail, map_C_to_C) # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk) connect_many_to_many(project_c_detail, map_c_pk_2) connect_many_to_many(project_c, map_bc_c_join_key) connect_many_to_many(join_build_ab_and_c_on_bc_join_key, join_probe_ab_and_c_on_bc_join_key) connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk) connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
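# The plan above is wired with two kinds of helpers: connect_many_to_many appears to link
# producer part p to consumer part p, while connect_all_to_all fans every producer out to
# every consumer (the repartitioning step ahead of a partitioned hash-join build). The toy
# sketch below is illustrative only and not part of the benchmark; the key name 'k', the
# part count and the buffer size are arbitrary assumptions.
def wiring_sketch():
    plan = QueryPlan(is_async=False, buffer_size=16)

    # Two toy pipeline stages plus a partitioned hash-join build stage.
    scans = map(lambda p: plan.add_operator(
        Map('k', 'wiring_scan_{}'.format(p), plan, False)), range(0, 2))
    repartition = map(lambda p: plan.add_operator(
        Map('k', 'wiring_map_{}'.format(p), plan, False)), range(0, 2))
    builds = map(lambda p: plan.add_operator(
        HashJoinBuild('k', 'wiring_build_{}'.format(p), plan, False)), range(0, 2))

    connect_many_to_many(scans, repartition)  # part-wise: wiring_scan_p feeds wiring_map_p
    connect_all_to_all(repartition, builds)   # shuffle: every map feeds every build

    return plan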
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts, path, format_):
    """Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap

    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan
    scan = map(lambda p: query_plan.add_operator(
        SQLTableScan("{}/topk_data_{}.csv".format(path, p),
                     "select * from S3Object;", format_, use_pandas, secure, use_native,
                     'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = ['F0', 'F1', 'F2']
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(lambda p: query_plan.add_operator(
        Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)),
        range(0, table_parts))

    # TopK
    sort_expr = SortExpression(sort_field, 'float', sort_order)
    topk = map(lambda p: query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # profile_path = '../benchmark-output/topk/'
    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()
    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
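# A hedged usage sketch for the run() above; the values are illustrative assumptions, not
# taken from any benchmark configuration. 'F0' matches the column names assigned in
# project_fn, and csv_format stands in for whichever format constant the SQLTableScan
# operators expect for the CSV input.
#
#   run(sort_field='F0', k=100, parallel=True, use_pandas=True, sort_order='ASC',
#       buffer_size=0, table_parts=32, path='topk', format_=csv_format)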
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_first_part,
        table_parts, queried_columns, select_columns, path, format_):
    """Executes the sampled (column-scan) top-k query: a first pass scans only the sort
    column to build a filtering predicate, and a second pass scans the filtered table
    while keeping track of the max/min records in a heap.

    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling scan (only the sort column is read in this pass)
    sample_scan = map(
        lambda p: query_plan.add_operator(
            # SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan("{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p),
                         'select {} from S3Object;'.format(sort_field),
                         format_, use_pandas, secure, use_native,
                         'column_scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)
    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float',
                        'select {} from S3object '.format(select_columns),
                        # ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False))
                        ' {} '.format(sort_field),
                        'sql_gen', query_plan, False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            # SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan("{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p),
                         "", format_, use_pandas, secure, use_native,
                         'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # profile_path = '../benchmark-output/groupby/'
    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    # Connect the operators
    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)
    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()
    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
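# A hedged usage sketch for the column-scan variant above; all values are illustrative
# assumptions. queried_columns names the columns fetched by the second scan (and must
# include sort_field), select_columns is the matching comma-separated SQL projection, and
# parquet_format stands in for whichever format constant the SQLTableScan operators expect.
#
#   run(sort_field='l_extendedprice', k=100, parallel=True, use_pandas=True,
#       sort_order='DESC', buffer_size=0, table_first_part=0, table_parts=32,
#       queried_columns=['l_orderkey', 'l_extendedprice'],
#       select_columns='l_orderkey, l_extendedprice',
#       path='parquet/lineitem', format_=parquet_format)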
def query_plan(settings): # type: (SyntheticBloomJoinSettings) -> QueryPlan """ :return: None """ query_plan = QueryPlan(is_async=settings.parallel, buffer_size=settings.buffer_size) def scan_A_fn(df): df.columns = settings.table_A_field_names return df # Define the operators scan_A = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_A_field_names), settings.table_A_filter_sql, get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_A_{}'.format(p), query_plan, False, fn=scan_A_fn)), range(0, settings.table_A_parts)) """ field_names_map_A = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names)) def project_fn_A(df): df.rename(columns=field_names_map_A, inplace=True) return df project_A = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()], 'project_A_{}'.format(p), query_plan, True, project_fn_A)), range(0, settings.table_A_parts)) """ bloom_create_a = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'. format(p), query_plan, False)), range(0, settings.table_A_parts)) def scan_B_fn(df): df.columns = settings.table_B_field_names return df scan_B = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_B_field_names), settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.table_B_AB_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_B_{}'.format(p), query_plan, False, fn=scan_B_fn)), range(0, settings.table_B_parts)) """ field_names_map_B = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names)) def project_fn_B(df): df.rename(columns=field_names_map_B, inplace=True) return df project_B = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()], 'project_B_{}'.format(p), query_plan, True, project_fn_B)), range(0, settings.table_B_parts)) """ def scan_C_fn(df): df.columns = settings.table_C_field_names return df scan_C = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_C_field_names), settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_BC_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_C_{}'.format(p), query_plan, False, fn=scan_C_fn)), range(0, settings.table_C_parts)) """ field_names_map_C = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names)) def project_fn_C(df): df.rename(columns=field_names_map_C, inplace=True) return df project_C = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in 
field_names_map_C.iteritems()], 'project_C_{}'.format(p), query_plan, True, project_fn_C)), range(0, settings.table_C_parts)) """ map_A_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) map_B_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_B_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_C_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) join_build_A_B = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_A_B = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. table_B_AB_join_key), 'join_probe_A_B_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) bloom_create_ab = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_AB_C = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'. format(p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_AB_C = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_AB_C_{}' .format(p), query_plan, False)), range(0, settings.table_C_parts)) def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype(float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Connect the operators connect_many_to_many(scan_A, map_A_to_B) #connect_many_to_many(scan_A, project_A) #connect_many_to_many(project_A, map_A_to_B) connect_all_to_all(map_A_to_B, join_build_A_B) connect_many_to_many(join_build_A_B, join_probe_A_B) #connect_many_to_many(project_A, bloom_create_a) connect_many_to_many(scan_A, bloom_create_a) connect_all_to_all(bloom_create_a, scan_B) connect_many_to_many(scan_B, map_B_to_B) #connect_many_to_many(scan_B, project_B) #connect_many_to_many(project_B, map_B_to_B) connect_all_to_all(map_B_to_B, join_probe_A_B) connect_many_to_many(join_probe_A_B, bloom_create_ab) connect_all_to_all(bloom_create_ab, scan_C) connect_many_to_many(join_build_AB_C, join_probe_AB_C) 
    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)
    connect_many_to_many(scan_C, map_C_to_C)
    # connect_many_to_many(scan_C, project_C)
    # connect_many_to_many(project_C, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)
    connect_many_to_many(join_probe_AB_C, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
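# A minimal driver sketch (not part of the original module) showing how the plan returned
# by query_plan() above might be executed end to end; it only uses calls that already
# appear in this file, and `settings` is whatever SyntheticBloomJoinSettings-style object
# the benchmark harness constructs.
def run_bloom_join(settings):
    plan = query_plan(settings)

    # Write the plan graph alongside the other benchmark outputs, then run the plan.
    plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())
    plan.execute()

    # Report per-operator metrics and shut the workers down.
    plan.print_metrics()
    plan.stop()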