def test_project_perf():
    """Executes a projection over many source rows to examine performance.

    :return: None
    """

    num_rows = 10000
    profile_file_name = os.path.join(
        ROOT_DIR, "../tests-output/" + gen_test_id() + ".prof")

    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    random_col_defs = [
        RandomIntColumnDef(0, 9),
        RandomStringColumnDef(10, 20),
        RandomDateColumnDef(datetime.strptime('2017-01-01', '%Y-%m-%d'),
                            datetime.strptime('2018-01-01', '%Y-%m-%d'))
    ]

    random_table_scan = query_plan.add_operator(
        RandomTableScan(num_rows, random_col_defs, 'random_table_scan',
                        query_plan, False))

    project = query_plan.add_operator(
        Project([
            ProjectExpression(lambda t_: t_['_0'], 'r_0'),
            ProjectExpression(lambda t_: t_['_1'], 'r_1'),
            ProjectExpression(lambda t_: t_['_2'], 'r_2')
        ], 'project', query_plan, False))
    project.set_profiled(True, profile_file_name)

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    random_table_scan.connect(project)
    project.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    query_plan.stop()

    # Write the profile
    s = pstats.Stats(profile_file_name)
    s.strip_dirs().sort_stats("time").print_stats()

    assert len(tuples) == num_rows + 1
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, format_):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan Index Files
    upper = perc * 100
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                '{}/data_{}.csv'.format(path, p),
                "select * from S3Object "
                " where cast(F0 as float) < {};".format(upper), format_,
                use_pandas, secure, use_native, 'scan_{}'.format(p),
                query_plan, False)),
        range(0, table_parts))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(collate), enumerate(scan))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()

    print('Done')

    tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
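# Minimal invocation sketch for the indexing benchmark above, intended to be
# called from a driver script. The shard count, selectivity and S3 key prefix
# are illustrative assumptions, and Format.CSV is assumed to be the
# input-format constant used by the other benchmarks in this repo.
def example_run_indexing_benchmark():
    run(parallel=True,
        use_pandas=True,
        buffer_size=0,
        table_parts=32,                     # assumed number of data_<n>.csv shards
        perc=0.01,                          # scan rows where cast(F0 as float) < 1.0
        path='indexing_benchmark/shards',   # hypothetical S3 key prefix
        format_=Format.CSV)                 # assumed format constant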
def run(use_pandas, secure, use_native, format_):
    profile_file_name = os.path.join(
        ROOT_DIR, "../benchmark-output/" + gen_test_id() + ".prof")
    if os.path.exists(profile_file_name):
        os.remove(profile_file_name)

    print("SQL Table Scan | Settings {}".format({
        'use_pandas': use_pandas,
        'secure': secure,
        'use_native': use_native
    }))

    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    scan = query_plan.add_operator(
        SQLTableScan('lineitem.csv',
                     "select "
                     " * "
                     "from "
                     " S3Object limit 100000", format_, use_pandas, secure,
                     use_native, 'scan', query_plan, True))
    # scan.set_profiled(True, profile_file_name)

    null = query_plan.add_operator(
        Null('null', query_plan, True))

    scan.connect(null)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    query_plan.print_metrics()

    query_plan.stop()

    # Write the profile
    # s = pstats.Stats(profile_file_name)
    # s.strip_dirs().sort_stats("time").print_stats()

    print("SQL Table Scan | Done")
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_first_part, table_parts, queried_columns, select_columns, path,
        format_):
    """
    Executes the column-scan sampling topk query: the sort column is scanned
    to build a filter threshold, then the filtered scan keeps track of the
    max/min records in a heap.

    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = map(
        lambda p: query_plan.add_operator(
            # SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p),
                'select {} from S3Object;'.format(sort_field), format_,
                use_pandas, secure, use_native, 'column_scan_{}'.format(p),
                query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)
    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order, 'float',
            'select {} from S3object '.format(select_columns),
            # ' CAST({} as float) '.format(sort_field),
            ' {} '.format(sort_field), 'sql_gen', query_plan, False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            # SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "",
                format_, use_pandas, secure, use_native, 'scan_{}'.format(p),
                query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # profile_path = '../benchmark-output/groupby/'
    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)
    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()

    print('Done')

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
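# Hedged sketch of the idea behind TopKFilterBuild above: take the k-th best
# value seen in the sampled sort column and emit the second-pass SQL that only
# returns rows at least as good as that threshold. The function name, SQL
# template and comparison are illustrative assumptions, not the operator's
# real internals.
def sketch_build_topk_filter_sql(sample_values, k, sort_order, sql_prefix,
                                 sort_sql_expr):
    # Assumes a non-empty sample. For DESC we keep the k largest values,
    # for ASC the k smallest, so the k-th entry is the cut-off.
    ordered = sorted(sample_values, reverse=(sort_order == 'DESC'))
    threshold = ordered[min(k, len(ordered)) - 1]
    comparator = '>=' if sort_order == 'DESC' else '<='
    return '{} where {} {} {};'.format(sql_prefix, sort_sql_expr, comparator,
                                       threshold)

# Example: sketch_build_topk_filter_sql([10.5, 88.0, 901.0], 2, 'DESC',
#                                       'select * from S3object',
#                                       'cast(F0 as float)')
# -> "select * from S3object where cast(F0 as float) >= 88.0;"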
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_): """ :return: None """ print('') print("TPCH Q17 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map( lambda p: query_plan.add_operator( tpch_q17.sql_scan_select_partkey_where_brand_container_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_op('part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_lineitem_join_build_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_bloom_create = query_plan.add_operator( tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create', query_plan)) lineitem_bloom_use = \ map(lambda p: query_plan.add_operator( tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op( lineitem_sharded, p, part_parts, use_pandas, secure, use_native, 'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q17. project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_probe_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) part_lineitem_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str( p), query_plan, False)), range(0, other_parts)) part_lineitem_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) lineitem_part_avg_group = map( lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_op( 'lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, other_parts)) lineitem_part_avg_group_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, other_parts)) part_lineitem_join_avg_group_join_build = \ map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_avg_group_join_probe = \ map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op( 'lineitem_filter' + '_' + str(p), query_plan)), range(0, other_parts)) extendedprice_sum_aggregate = map( lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ 
AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q17.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_lineitem_join_probe_map) map(lambda o: o.set_async(False), part_lineitem_join_build_map) map(lambda o: o.set_async(False), lineitem_part_avg_group) map(lambda o: o.set_async(False), lineitem_part_avg_group_project) map(lambda o: o.set_async(False), extendedprice_sum_aggregate) extendedprice_sum_aggregate_project.set_async(False) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project)) connect_many_to_one(part_project, part_bloom_create) connect_one_to_many(part_bloom_create, lineitem_bloom_use) # part_project.connect(part_bloom_create) # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)), # enumerate(part_bloom_create_map)) # part_bloom_create.connect(lineitem_bloom_use) # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)), # enumerate(part_bloom_create)) # lineitem_bloom_use.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use)) # part_lineitem_join.connect_left_producer(part_project) map( lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_lineitem_join_build_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) # part_lineitem_join.connect_right_producer(lineitem_project) # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project)) connect_many_to_many(lineitem_project, part_lineitem_join_probe_map) map( lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(part_lineitem_join_probe_map)) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. 
connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round( float(tuples[1][0]), 10 ) == 372414.2899999995 # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
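# Hedged sketch of the connect_* helpers used in the plan above
# (connect_many_to_many, connect_all_to_all, connect_many_to_one,
# connect_one_to_many). The real s3filter helpers may differ; this only
# illustrates the wiring pattern in terms of the pairwise connect() calls
# written out explicitly elsewhere in these plans.
def sketch_connect_many_to_many(producers, consumers):
    # The i-th producer feeds the i-th consumer (lists assumed equal length).
    for i, producer in enumerate(producers):
        producer.connect(consumers[i])

def sketch_connect_all_to_all(producers, consumers):
    # Every producer feeds every consumer (e.g. a hash-partitioned exchange).
    for producer in producers:
        for consumer in consumers:
            producer.connect(consumer)

def sketch_connect_many_to_one(producers, consumer):
    for producer in producers:
        producer.connect(consumer)

def sketch_connect_one_to_many(producer, consumers):
    for consumer in consumers:
        producer.connect(consumer)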
def run_head_table_sampling(stats, sort_field_index, sort_field, k, sample_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): secure = False use_native = False print('') print( "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}" .format(sort_field, sort_order, k, sample_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, 1 ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) sample_scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}.{}".format(shards_path, p), 'select {} from S3Object limit {};'.format( sort_field, per_part_samples), format_, use_pandas, secure, use_native, 'sample_scan_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Sampling project def project_fn(df): df.columns = [sort_field] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda op: op.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), 
enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
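# Minimal invocation sketch for run_head_table_sampling above, to be called
# from a driver. The shard layout, table key and column index are illustrative
# assumptions; shards are expected at '<shards_path>.<n>' as in the scans above.
def example_run_head_table_sampling():
    stats = []  # the function appends its measurements to this list
    run_head_table_sampling(stats,
                            sort_field_index='_5',    # assumed positional column name
                            sort_field='l_extendedprice',
                            k=100,
                            sample_size=1000,
                            parallel=True,
                            use_pandas=True,
                            sort_order='DESC',
                            buffer_size=0,
                            table_parts_start=0,
                            table_parts_end=31,       # assumed 32 shards
                            tbl_s3key='tpch-sf1/lineitem.csv',                     # assumed table key
                            shards_path='tpch-sf1/lineitem_sharded/lineitem.csv',  # assumed shard prefix
                            format_=Format.CSV,       # assumed format constant
                            sampling_only=True)
    return stats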
def run(parallel, buffer_size): """ :return: None """ print('') print("TPCH Q14 Filtered Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = query_plan.add_operator( tpch_q14. sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_scan', query_plan)) lineitem_project = query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_project', query_plan)) part_scan = query_plan.add_operator( tpch_q14.sql_scan_part_partkey_type_part_where_brand12_operator_def( 'part_scan', query_plan)) part_project = query_plan.add_operator( tpch_q14.project_partkey_type_operator_def('part_project', query_plan)) join = query_plan.add_operator( tpch_q14.join_lineitem_part_operator_def('join', query_plan)) aggregate = query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) lineitem_scan.connect(lineitem_project) part_scan.connect(part_project) join.connect_left_producer(lineitem_project) join.connect_right_producer(part_project) join.connect(aggregate) aggregate.connect(aggregate_project) aggregate_project.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert tuples[1] == [15.090116526324298]
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts, customer_sharded, order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None, order_filter_sql=None, lineitem_filter_sql=None): """ :return: None """ print('') print("TPCH Q3 Bloom Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) customer_scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_), "select " " c_custkey " "from " " S3Object " "where " " c_mktsegment = 'BUILDING' " " {} " " {} " .format( ' and ' + customer_filter_sql if customer_filter_sql is not None else '', get_sql_suffix('customer', customer_parts, p, customer_sharded, add_where=False)), format_, use_pandas, secure, use_native, 'customer_scan' + '_{}'.format(p), query_plan, False)), range(0, customer_parts)) def customer_project_fn(df): df = df.filter(items=['_0'], axis=1) df.rename(columns={'_0': 'c_custkey'}, inplace=True) return df customer_project = map(lambda p: query_plan.add_operator( Project([], 'customer_project' + '_{}'.format(p), query_plan, False, customer_project_fn)), range(0, customer_parts)) customer_bloom_create = query_plan.add_operator( BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False, fp_rate)) customer_map = map(lambda p: query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)), range(0, customer_parts)) order_scan = map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_), "select " " o_custkey, o_orderkey, o_orderdate, o_shippriority " "from " " S3Object " "where " " cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) " " {} " " {} " .format( ' and ' + order_filter_sql if order_filter_sql is not None else '', get_sql_suffix('orders', order_parts, p, order_sharded, add_where=False)), 'o_custkey', format_, use_pandas, secure, use_native, 'order_scan' + '_{}'.format(p), query_plan, False)), range(0, order_parts)) def order_project_fn(df): df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1) df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'}, inplace=True) return df order_project = map(lambda p: query_plan.add_operator( Project([], 'order_project' + '_{}'.format(p), query_plan, False, order_project_fn)), range(0, customer_parts)) order_map_1 = map(lambda p: query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)), range(0, order_parts)) customer_order_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'), 'customer_order_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) order_bloom_create = query_plan.add_operator( BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False, fp_rate)) lineitem_scan = map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_), "select " " l_orderkey, l_extendedprice, l_discount " "from " " S3Object " "where " " cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) " " {} " " {} " .format( ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '', 
get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded, add_where=False)), 'l_orderkey', format_, use_pandas, secure, use_native, 'lineitem_scan' + '_{}'.format(p), query_plan, False)), range(0, lineitem_parts)) def lineitem_project_fn(df): df = df.filter(items=['_0', '_1', '_2'], axis=1) df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'}, inplace=True) return df lineitem_project = map(lambda p: query_plan.add_operator( Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False, lineitem_project_fn)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) order_map_2 = map(lambda p: query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'), 'customer_order_lineitem_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) def groupby_fn(df): df['l_extendedprice'] = df['l_extendedprice'].astype(np.float) df['l_discount'] = df['l_discount'].astype(np.float) df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount']) grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group = map(lambda p: query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group' + '_{}'.format(p), query_plan, False, groupby_fn)), range(0, other_parts)) def group_reduce_fn(df): grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group_reduce = query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group_reduce', query_plan, False, group_reduce_fn)) top = query_plan.add_operator( Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas, 'top', query_plan, False)) collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), customer_project) map(lambda o: o.set_async(False), order_project) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), customer_map) map(lambda o: o.set_async(False), order_map_1) map(lambda o: o.set_async(False), order_map_2) # Connect the operators connect_many_to_many(customer_scan, customer_project) connect_many_to_many(customer_project, customer_map) connect_many_to_one(customer_project, customer_bloom_create) connect_one_to_many(customer_bloom_create, order_scan) connect_many_to_many(order_scan, order_project) connect_many_to_many(order_project, order_map_1) connect_all_to_all(customer_map, customer_order_join_build) connect_many_to_many(customer_order_join_build, customer_order_join_probe) connect_all_to_all(order_map_1, customer_order_join_probe) # 
connect_many_to_one(customer_order_join_probe, collate) connect_many_to_one(order_project, order_bloom_create) connect_one_to_many(order_bloom_create, lineitem_scan) connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_map) connect_many_to_many(customer_order_join_probe, order_map_2) connect_all_to_all(order_map_2, customer_order_lineitem_join_build) connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe) connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe) # connect_many_to_one(customer_order_lineitem_join_probe, collate) connect_many_to_many(customer_order_lineitem_join_probe, group) # connect_many_to_one(group, collate) connect_many_to_one(group, group_reduce) connect_one_to_one(group_reduce, top) connect_one_to_one(top, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("customer_parts: {}".format(customer_parts)) print("order_parts: {}".format(order_parts)) print("lineitem_parts: {}".format(lineitem_parts)) print("customer_sharded: {}".format(customer_sharded)) print("order_sharded: {}".format(order_sharded)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue'] assert len(tuples) == 10 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL test_util.assert_tuples(expected_result, tuples)
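# Hedged sketch of the bloom pushdown used above (BloomCreate feeding
# SQLTableScanBloomUse): build a bitmap over the build-side join keys, then
# have the probe-side scan skip rows whose key cannot be in the bitmap. This
# only illustrates the idea; the real operators serialise the filter into the
# S3 Select SQL rather than filtering client-side as done here.
class SimpleBloomFilterSketch(object):
    def __init__(self, num_bits=1024, num_hashes=3):
        self.num_bits = num_bits
        self.num_hashes = num_hashes
        self.bits = [False] * num_bits

    def _positions(self, key):
        # Derive a few deterministic bit positions per key from Python's hash.
        return [hash((key, i)) % self.num_bits for i in range(self.num_hashes)]

    def add(self, key):
        for pos in self._positions(key):
            self.bits[pos] = True

    def might_contain(self, key):
        # False positives are possible, false negatives are not.
        return all(self.bits[pos] for pos in self._positions(key))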
def run(sf, parallel, use_pandas, secure, use_native, buffer_size,
        lineitem_parts, sharded, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q1 Baseline Group By")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q1.sql_scan_lineitem_operator_def(
                sharded, p, sf, use_pandas, secure, use_native,
                'lineitem_scan' + '_' + str(p), query_plan, format_)),
        range(0, lineitem_parts))

    '''
    lineitem_project = map(lambda p: query_plan.add_operator(
        tpch_q1.project_lineitem_operator_def(
            'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    date = '1998-12-01'
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') - timedelta(days=117)

    lineitem_filter = map(lambda p: query_plan.add_operator(
        tpch_q1.filter_shipdate_operator_def(
            max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    profile_file_name = os.path.join(ROOT_DIR, "../tests-output/" + gen_test_id() + ".prof")

    groupby = map(lambda p: query_plan.add_operator(
        tpch_q1.groupby_returnflag_linestatus_operator_def(
            'groupby' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))
    # groupby[0].set_profiled(True, profile_file_name)

    groupby_reduce = query_plan.add_operator(
        tpch_q1.groupby_reduce_returnflag_linestatus_operator_def(
            'groupby_reduce', query_plan, use_pandas))

    # lineitem_scan[0].set_profiled(True,
    #                               os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # lineitem_project[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
    #                                  gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True,
    #                         os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
    #                             gen_test_id() + "_groupby_reduce" + ".prof"))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    # map(lambda o: o.set_async(False), groupby)

    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(lineitem_filter))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(collate)
    '''

    groupby = map(
        lambda p: query_plan.add_operator(
            tpch_q1.groupby_returnflag_linestatus_operator_def(
                'groupby' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: o.connect(collate), lineitem_scan)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()

    print('Done')

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_operator_def( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str( p), query_plan, sf, format_)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) # part_filter = map(lambda p: # query_plan.add_operator( # tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), # range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() sum2_ = df['_1'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: 
o.set_async(False), part_aggregate) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_filter) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) connect_all_to_all(part_map, join_build) connect_many_to_many(join_build, join_probe) connect_many_to_many(lineitem_filter, lineitem_map) connect_all_to_all(lineitem_map, join_probe) connect_many_to_many(join_probe, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(lineitem_parts)) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
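# Hedged sketch of the final projection produced by aggregate_project above.
# TPC-H Q14 reports promo_revenue as 100 * (promo revenue) / (total revenue);
# the two inputs are assumed to be the '_0' and '_1' sums emitted by
# aggregate_reduce, in that order.
def sketch_promo_revenue(promo_sum, total_sum):
    # promo_sum: sum of l_extendedprice * (1 - l_discount) over PROMO parts
    # total_sum: the same sum over all joined rows
    return 100.0 * promo_sum / total_sum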
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) ########################## ## Phase 1. Find out group names ########################## # Scan scan_phase1 = map( lambda p: query_plan.add_operator( SQLTableScan( files.format(p), "select {} from S3Object;".format(','.join( group_fields)), format_, use_pandas, secure, use_native, 'scan_phase1_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = group_fields return df project_exprs = [ ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields) ] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # Groupby def groupby_fn(df): return df.drop_duplicates() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) # GroupbyFilterBuild agg_exprs = [('SUM', 'CAST({} AS float)'.format(agg_field)) for agg_field in agg_fields] groupby_filter_build = query_plan.add_operator( GroupbyFilterBuild(group_fields, agg_fields, agg_exprs, 'groupby_filter_build', query_plan, False)) ########################## ## Phase 2. Perform aggregation at S3. ########################## # Scan scan_phase2 = map( lambda p: query_plan.add_operator( SQLTableScan( 'groupby_benchmark/shards-10GB/groupby_data_{}.csv'.format( p), "", format_, use_pandas, secure, use_native, 'scan_phase2_{}'.format(p), query_plan, False)), range(0, table_parts)) groupby_decoder = map( lambda p: query_plan.add_operator( GroupbyDecoder(agg_fields, 'groupby_decoder_{}'.format(p), query_plan, False)), range(0, table_parts)) def groupby_fn_phase2(df): #print df df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby_reduce_phase2 = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce_phase2', query_plan, False, groupby_fn_phase2)) #scan_phase1[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase1_0" + ".prof")) #scan_phase2[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase2_0" + ".prof")) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # phase 1 map(lambda (p, o): o.connect(project[p]), enumerate(scan_phase1)) map(lambda (p, o): o.connect(groupby[p]), enumerate(project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(groupby_filter_build) # phase 2 map(lambda (p, o): groupby_filter_build.connect(o, 0), enumerate(scan_phase2)) map(lambda (p, o): groupby_filter_build.connect(o, 1), enumerate(groupby_decoder)) map(lambda (p, o): o.connect(groupby_decoder[p]), enumerate(scan_phase2)) map(lambda (p, o): o.connect(groupby_reduce_phase2), enumerate(groupby_decoder)) # map(lambda (p, o): groupby_reduce.connect(o), enumerate(groupby_decoder)) groupby_reduce_phase2.connect(collate) # Plan 
settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
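# Hedged sketch of the second-phase SQL that GroupbyFilterBuild in the
# two-phase plan above presumably generates: once phase 1 has discovered the
# distinct group values, each (group value, aggregate field) pair can be
# pushed to S3 Select as a SUM(CASE WHEN ...) expression. Column names and the
# exact template are assumptions for illustration only.
def sketch_build_phase2_sql(group_field, group_values, agg_field):
    cases = [
        "sum(case when {} = '{}' then cast({} as float) else 0 end)".format(
            group_field, v, agg_field) for v in group_values
    ]
    return 'select {} from S3Object;'.format(', '.join(cases))

# sketch_build_phase2_sql('G0', ['a', 'b'], 'F0') ->
# "select sum(case when G0 = 'a' then cast(F0 as float) else 0 end),
#  sum(case when G0 = 'b' then cast(F0 as float) else 0 end) from S3Object;"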
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_parts, path, format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of
    the max/min records in a heap

    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan("{}/topk_data_{}.csv".format(path, p),
                         "select * from S3Object;", format_, use_pandas,
                         secure, use_native, 'scan_{}'.format(p), query_plan,
                         False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = ['F0', 'F1', 'F2']
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn)),
        range(0, table_parts))

    # TopK
    sort_expr = SortExpression(sort_field, 'float', sort_order)
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False)),
        range(0, table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # profile_path = '../benchmark-output/topk/'
    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()

    print('Done')

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
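# Minimal invocation sketch for the filtered top-k benchmark above, to be
# called from a driver. The sort field must be one of the three columns named
# in project_fn ('F0'..'F2'); shard count and S3 prefix are illustrative
# assumptions.
def example_run_topk_filtered():
    run(sort_field='F0', k=100, parallel=True, use_pandas=True,
        sort_order='DESC', buffer_size=0, table_parts=32,
        path='topk_benchmark/shards',   # hypothetical prefix holding topk_data_<n>.csv
        format_=Format.CSV)             # assumed format constant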
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q14 Semi Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) part_scan_1 = query_plan.add_operator( tpch_q14.sql_scan_part_partkey_where_brand12_operator_def( False, 0, 1, use_pandas, 'part_table_scan_1', query_plan)) part_scan_1_project = query_plan.add_operator( tpch_q14.project_p_partkey_operator_def('part_scan_1_project', query_plan)) part_bloom_create = query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create', query_plan)) lineitem_scan_1 = query_plan.add_operator( tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def( min_shipped_date, max_shipped_date, False, 0, use_pandas, 'lineitem_scan_1', query_plan)) lineitem_scan_1_project = query_plan.add_operator( tpch_q14.project_l_partkey_operator_def('lineitem_scan_1_project', query_plan)) part_lineitem_join_1 = query_plan.add_operator( tpch_q14.join_part_lineitem_operator_def('part_lineitem_join_1', query_plan)) join_bloom_create = query_plan.add_operator( tpch_q14.bloom_create_l_partkey_operator_def('join_bloom_create', query_plan)) part_scan_2 = query_plan.add_operator( tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def( False, 0, 1, use_pandas, 'part_table_scan_2', query_plan)) part_scan_2_project = query_plan.add_operator( tpch_q14.project_partkey_type_operator_def('part_scan_2_project', query_plan)) lineitem_scan_2 = query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def( min_shipped_date, max_shipped_date, False, 0, use_pandas, 'lineitem_scan_2', query_plan)) lineitem_scan_2_project = query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_scan_2_project', query_plan)) part_lineitem_join_2 = query_plan.add_operator( tpch_q14.join_part_lineitem_operator_def('part_lineitem_join_2', query_plan)) aggregate = query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) part_scan_1.connect(part_scan_1_project) part_scan_1_project.connect(part_bloom_create) part_bloom_create.connect(lineitem_scan_1) part_lineitem_join_1.connect_left_producer(part_scan_1_project) lineitem_scan_1.connect(lineitem_scan_1_project) part_lineitem_join_1.connect_right_producer(lineitem_scan_1_project) part_lineitem_join_1.connect(join_bloom_create) join_bloom_create.connect(part_scan_2) join_bloom_create.connect(lineitem_scan_2) part_scan_2.connect(part_scan_2_project) part_lineitem_join_2.connect_left_producer(part_scan_2_project) lineitem_scan_2.connect(lineitem_scan_2_project) part_lineitem_join_2.connect_right_producer(lineitem_scan_2_project) part_lineitem_join_2.connect(aggregate) aggregate.connect(aggregate_project) aggregate_project.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert 
len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert tuples[1] == [15.090116526324298]
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan( files.format(p), "select {} from S3Object;".format(','.join( group_fields + agg_fields)), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = group_fields + agg_fields return df project_exprs = [ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields)] \ + [ProjectExpression(lambda t_: t_['_{}'.format(n + len(group_fields))], v) for n, v in enumerate(agg_fields)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # Groupby def groupby_fn(df): df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(groupby[p]), enumerate(project)) map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby)) groupby_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
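# Minimal invocation sketch for the baseline group-by benchmark above, to be
# called from a driver. Field names and shard count are illustrative
# assumptions; the key pattern reuses the shards referenced by the two-phase
# variant earlier in this file.
def example_run_groupby_baseline():
    run(group_fields=['G0'],
        agg_fields=['F0'],
        parallel=True,
        use_pandas=True,
        buffer_size=0,
        table_parts=32,
        files='groupby_benchmark/shards-10GB/groupby_data_{}.csv',  # '{}' is the shard index
        format_=Format.CSV)                                         # assumed format constant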
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # This date is chosen because it triggers the filter to filter out 1 of the rows in the root data set. date = '1996-03-13' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_extra_filtered_operator_def( lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project)) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(join_build)), enumerate(join_probe)) map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o), enumerate(lineitem_filter)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: 
{}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 33.4262326420
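# --- Illustrative sketch (not from the benchmark code above) ---------------
# The join is split into HashJoinBuild operators (hash the filtered part rows
# on p_partkey) and HashJoinProbe operators (stream lineitem rows and look up
# matches); each lineitem partition is routed to a probe via 'p % part_parts'.
# A dict-based sketch of the build/probe idea, independent of the framework
# and using illustrative rows only:
def build_phase(part_rows):
    # Hash the (smaller) build side on the join key.
    hash_table = {}
    for row in part_rows:
        hash_table.setdefault(row['p_partkey'], []).append(row)
    return hash_table


def probe_phase(hash_table, lineitem_rows):
    # Stream the probe side and emit a joined row for every key match.
    for l_row in lineitem_rows:
        for p_row in hash_table.get(l_row['l_partkey'], []):
            joined = dict(l_row)
            joined.update(p_row)
            yield joined


parts = [{'p_partkey': 1, 'p_type': 'PROMO BRUSHED TIN'},
         {'p_partkey': 2, 'p_type': 'STANDARD BRASS'}]
lineitems = [{'l_partkey': 1, 'l_extendedprice': 100.0, 'l_discount': 0.1},
             {'l_partkey': 3, 'l_extendedprice': 50.0, 'l_discount': 0.0}]
print(list(probe_phase(build_phase(parts), lineitems)))  # only l_partkey == 1 matches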
def topk_baseline(stats, k, sort_index='_5', col_type=float, col_name='l_extendedprice', sort_order='DESC', use_pandas=True, filtered=False, table_name='tpch-sf1/lineitem.csv', shards_prefix='tpch-sf1/lineitem_sharded', shards_start=0, shards_end=31): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ limit = k num_rows = 0 parallel_shards = True processes = multiprocessing.cpu_count() query_stats = [ 'baseline' if filtered is False else 'filtered', shards_prefix, col_name, sort_order, limit ] sql = 'select * from S3Object;' if filtered: sql = '''select l_extendedprice from S3Object;''' sort_index = '_0' print("\n\nBaseline TopK with order {} on field {}\n".format( sort_order, col_name)) query_plan = QueryPlan(is_async=True) # Query plan # ts = query_plan.add_operator( # SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False)) sort_exp = SortExpression(sort_index, col_type, sort_order) top_op = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk', query_plan, False)) for process in range(processes): proc_parts = [ x for x in range(shards_start, shards_end + 1) if x % processes == process ] pc = query_plan.add_operator( SQLShardedTableScan(table_name, sql, use_pandas, True, False, "topk_table_scan_parts_{}".format(proc_parts), proc_parts, shards_prefix, parallel_shards, query_plan, False)) # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts)) pc_top = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk_parts_{}'.format(proc_parts), query_plan, False)) pc.connect(pc_top) pc_top.connect(top_op) c = query_plan.add_operator(Collate('collate', query_plan, False)) top_op.connect(c) # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results for t in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) # assert num_rows == limit + 1 cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] query_stats += [ 0, 0, query_plan.total_elapsed_time, query_plan.total_elapsed_time, 0, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost, num_rows == limit + 1 ] stats.append(query_stats) # Write the metrics query_plan.print_metrics() query_plan.stop()
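# --- Illustrative sketch (not from the benchmark code above) ---------------
# Each sharded scan feeds a per-shard Top operator and the partial results are
# merged by a single global Top. That is safe because any row in the global
# top-k must also be in the top-k of its own shard. A standalone heapq version
# of the same merge, with made-up values:
import heapq


def local_topk(rows, k, key):
    # Per-shard top-k (largest first, i.e. a DESC sort order).
    return heapq.nlargest(k, rows, key=key)


def merge_topk(partials, k, key):
    # Re-applying top-k over the per-shard winners gives the global answer.
    merged = [row for part in partials for row in part]
    return heapq.nlargest(k, merged, key=key)


shards = [[{'l_extendedprice': v} for v in (10.0, 90.0, 40.0)],
          [{'l_extendedprice': v} for v in (95.0, 5.0, 60.0)]]
partials = [local_topk(s, 2, key=lambda r: r['l_extendedprice']) for s in shards]
print(merge_topk(partials, 2, key=lambda r: r['l_extendedprice']))  # 95.0 and 90.0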
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_): """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3. This works by: 1. Scanning part and filtering on brand and container 2. It then scans lineitem 3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in step 1) 4. It then computes the average of l_quantity from the joined table and groups the results by partkey 5. It then joins these computed averages with the joined table in step 3 6. It then filters out any rows where l_quantity is less than the computed average TODO: There are few ways this can be done, the above is just one. :return: None """ print('') print("TPCH Q17 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_part_select_all_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) lineitem_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_lineitem_select_all_op( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_brand_container_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_filter = map(lambda p: query_plan.add_operator(tpch_q17.filter_brand_container_op( 'part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map(lambda p: query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) part_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_part_avg_group = map(lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, other_parts)) lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, other_parts)) part_lineitem_join_avg_group_join_build = \ map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_avg_group_join_probe = \ map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_filter = map(lambda p: query_plan.add_operator( 
tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)), range(0, other_parts)) extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_filter) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), lineitem_part_avg_group) map(lambda o: o.set_async(False), lineitem_part_avg_group_project) map(lambda o: o.set_async(False), extendedprice_sum_aggregate) extendedprice_sum_aggregate_project.set_async(False) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter)) # lineitem_scan.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project)) # part_lineitem_join.connect_left_producer(part_project) # part_lineitem_join.connect_right_producer(lineitem_project) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(lineitem_map)) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), # enumerate(part_lineitem_join_avg_group_join_build)), # enumerate(part_lineitem_join_avg_group_join_probe)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o), 
enumerate(part_lineitem_join_probe)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 372414.2899999995 # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
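# --- Illustrative sketch (not from the benchmark code above) ---------------
# Assuming the standard TPC-H Q17 definition (the operator internals are not
# shown here): join lineitem with the brand/container-filtered parts, compute
# 0.2 * avg(l_quantity) per part, keep only the small-quantity rows, and report
# sum(l_extendedprice) / 7 as avg_yearly. A plain pandas sketch with toy rows:
import pandas as pd


def q17_avg_yearly(lineitem, part):
    joined = lineitem.merge(part, left_on='l_partkey', right_on='p_partkey')
    # Per-part quantity threshold, joined back onto every row of that part.
    thresholds = (0.2 * joined.groupby('l_partkey')['l_quantity'].mean()).rename('avg_qty')
    joined = joined.join(thresholds, on='l_partkey')
    small = joined[joined['l_quantity'] < joined['avg_qty']]
    return small['l_extendedprice'].sum() / 7.0


part = pd.DataFrame({'p_partkey': [1]})
lineitem = pd.DataFrame({'l_partkey': [1, 1, 1],
                         'l_quantity': [1.0, 30.0, 40.0],
                         'l_extendedprice': [10.0, 300.0, 400.0]})
print(q17_avg_yearly(lineitem, part))  # only the qty == 1.0 row qualifies -> 10 / 7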
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_): """ :return: None """ print('') print("TPCH Q19 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Define the operators lineitem_bloom_use = \ map(lambda p: query_plan.add_operator( tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) part_bloom_create = query_plan.add_operator( tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan, fp_rate)) part_scan = map( lambda p: query_plan.add_operator( tpch_q19. sql_scan_part_partkey_brand_size_container_where_filtered_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) lineitem_project = \ map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q19.project_partkey_brand_size_container_filtered_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) # part_bloom_create_map = map(lambda p: # query_plan.add_operator( # Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)), # range(0, part_parts)) # lineitem_part_join = map(lambda p: # query_plan.add_operator(tpch_q19.join_op(query_plan)), # range(0, part_parts)) lineitem_part_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str( p), query_plan, False)), range(0, other_parts)) lineitem_part_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'lineitem_part_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) filter_op = map( lambda p: query_plan.add_operator( tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)), range(0, other_parts)) aggregate = map( lambda p: query_plan.add_operator( tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan, use_pandas)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( tpch_q19.aggregate_project_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q19.collate_op('collate', query_plan)) map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), filter_op) map(lambda o: o.set_async(False), aggregate) aggregate_project.set_async(False) # Connect 
the operators # part_scan.connect(part_project) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) # part_project.connect(part_bloom_create) connect_many_to_one(part_project, part_bloom_create) # part_bloom_create.connect(lineitem_bloom_use) connect_one_to_many(part_bloom_create, lineitem_bloom_use) # lineitem_bloom_use.connect(lineitem_project) connect_many_to_many(lineitem_bloom_use, lineitem_project) connect_many_to_many(lineitem_project, lineitem_map) # lineitem_part_join.connect_left_producer(lineitem_project) connect_all_to_all(part_map, lineitem_part_join_build) # lineitem_part_join.connect_right_producer(part_project) connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe) connect_all_to_all(lineitem_map, lineitem_part_join_probe) # lineitem_part_join.connect(filter_op) connect_many_to_many(lineitem_part_join_probe, filter_op) # filter_op.connect(aggregate) connect_many_to_many(filter_op, aggregate) # aggregate.connect(aggregate_project) connect_many_to_one(aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) # aggregate_project.connect(collate) connect_one_to_one(aggregate_project, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
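# --- Illustrative sketch (not from the benchmark code above) ---------------
# 'part_bloom_create' builds a Bloom filter over the filtered p_partkey values
# and the lineitem bloom-scan operators use it so that most non-matching
# lineitem rows never leave S3. The class below is a deliberately tiny Bloom
# filter to illustrate the membership test only; it is not the operators'
# actual implementation.
import hashlib


class ToyBloomFilter(object):
    def __init__(self, m=1024, k=3):
        # m bits, k hash functions derived from slices of an md5 digest.
        self.m, self.k, self.bits = m, k, 0

    def _positions(self, value):
        digest = hashlib.md5(str(value).encode('utf-8')).hexdigest()
        return [int(digest[i * 8:(i + 1) * 8], 16) % self.m for i in range(self.k)]

    def add(self, value):
        for pos in self._positions(value):
            self.bits |= 1 << pos

    def might_contain(self, value):
        # May return false positives, never false negatives.
        return all(self.bits & (1 << pos) for pos in self._positions(value))


bloom = ToyBloomFilter()
for partkey in (1, 7, 42):
    bloom.add(partkey)
print([pk for pk in (1, 2, 7, 99) if bloom.might_contain(pk)])  # includes 1 and 7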
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file scan = map( lambda p: query_plan.add_operator( SQLTableScan( get_file_key('lineitem', True, p, sf=sf, format_=format_ ), "select * from S3Object " "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};" .format(lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) ''' scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_), "select * from S3Object " "where l_extendedprice >= {} and l_extendedprice <= {};".format( lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) ''' # project def fn(df): df.columns = [ 'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment' ] df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(0, table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
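# --- Illustrative sketch (not from the benchmark code above) ---------------
# The scan pushes the range predicate into S3 Select, so only qualifying rows
# are returned and counted; functionally it is the same filter one would apply
# client-side after pulling the whole partition, just evaluated before the data
# leaves S3. The pandas snippet below is the client-side equivalent on made-up
# data (CSV fields arrive as strings, hence the cast in both places):
import pandas as pd


def local_range_filter(df, lower, upper):
    price = df['l_extendedprice'].astype(float)
    return df[(price >= lower) & (price <= upper)]


df = pd.DataFrame({'l_extendedprice': ['100.5', '20000.0', '905.0']})
print(len(local_range_filter(df, 100, 1000)))  # 2 rows survive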
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files): """ :return: None """ secure = False use_native = False print('') print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}". format(group_fields, agg_fields)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) def fn(df): df.columns = [ 'G0', 'G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', ] df = df.filter(items=group_fields + agg_fields, axis=1) return df # Scan scan = map( lambda p: query_plan.add_operator( TableScan(files.format(p), use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False, fn=fn)), range(0, table_parts)) # Groupby def groupby_fn(df): df[agg_fields] = df[agg_fields].astype(np.float) grouped = df.groupby(group_fields) agg_df = pd.DataFrame( {f: grouped[f].sum() for n, f in enumerate(agg_fields)}) return agg_df.reset_index() groupby = map( lambda p: query_plan.add_operator( Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)), range(0, table_parts)) # inlined map(lambda p: p.set_async(False), groupby) groupby_reduce = query_plan.add_operator( Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) #profile_path = '../benchmark-output/groupby/' #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof")) #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof")) #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof")) #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof")) #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof")) connect_many_to_many(scan, groupby) connect_many_to_one(groupby, groupby_reduce) connect_one_to_one(groupby_reduce, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf, nthreads=16, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) assert sf == 1 or sf == 10 # Scan Index Files index_scan = map( lambda p: query_plan.add_operator( SQLTableScan( 'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}' .format(sf, p if sf == 1 else p + 1), "select first_byte, last_byte " " from S3Object " " where col_values >= {} and col_values <= {};".format( lower, upper), format_, use_pandas, secure, use_native, 'index_scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Range accesses range_access = map( lambda p: query_plan.add_operator( TableRangeAccess( get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV), use_pandas, secure, use_native, 'range_access_{}'.format( p), query_plan, False)), range(0, table_parts)) map(lambda o: o.set_nthreads(nthreads), range_access) # Aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan)) map(lambda (p, o): o.connect(aggregate), enumerate(range_access)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
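# --- Illustrative sketch (not from the benchmark code above) ---------------
# The index scan returns (first_byte, last_byte) pairs and TableRangeAccess
# then reads only those slices of the data objects. The helper below shows the
# underlying idea with plain boto3 ranged GETs; the bucket/key names are
# hypothetical and the byte ranges are assumed to be inclusive, matching the
# HTTP Range header. It is a sketch, not the operator's implementation.
import boto3


def fetch_ranges(bucket, key, ranges):
    # Fetch only the byte ranges an index scan identified, not the whole object.
    s3 = boto3.client('s3')
    for first_byte, last_byte in ranges:
        resp = s3.get_object(Bucket=bucket, Key=key,
                             Range='bytes={}-{}'.format(first_byte, last_byte))
        yield resp['Body'].read()


# Hypothetical usage (ranges would come from the index_l_extendedprice scan):
# for chunk in fetch_ranges('my-bucket', 'tpch-sf1/lineitem_sharded/lineitem_1.csv', [(0, 1023)]):
#     print(len(chunk))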
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns, select_columns, lower, upper, path, format_=Format.CSV): secure = False use_native = False print('') print("CSV Filter+Project Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}/lineitem.tbl.{}".format(path, p ), "select {} from S3Object " "where cast(l_orderkey as int) >= {} and cast(l_orderkey as int) <= {};" .format(select_columns, lower, upper), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # project def fn(df): df.columns = queried_columns df[['l_orderkey']] = df[['l_orderkey']].astype(np.int64) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(table_first_part, table_first_part + table_parts)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run_local_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format( sort_field, sort_order)) print("----------------------") stats += [ 'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scan = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda (i, op): sample_scan[i].set_nthreads(n_threads), enumerate(sample_scan)) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 
'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda o: tbl_smpler.connect(o), sample_scan) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts_end - table_parts_start)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
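# --- Illustrative sketch (not from the benchmark code above) ---------------
# 'sample_topk_reduce' feeds TopKFilterBuild, which turns the k-th best sampled
# value into a WHERE clause for the second scan, so the full pass only returns
# rows that can still make the top-k (safe as long as the sample has at least k
# rows). The function below is a simplified version of that threshold-to-SQL
# step; the real operator may handle ties, scaling and undersized samples
# differently.
def build_topk_predicate(sample_values, k, sort_order='DESC', field='l_extendedprice'):
    ordered = sorted(sample_values, reverse=(sort_order == 'DESC'))
    threshold = ordered[min(k, len(ordered)) - 1]
    # For DESC we only need rows at or above the sampled cutoff (and vice versa).
    op = '>=' if sort_order == 'DESC' else '<='
    sql = "select * from S3Object where CAST({} as float) {} {};".format(field, op, threshold)
    return threshold, sql


threshold, sql = build_topk_predicate([5.0, 99.0, 42.0, 87.0, 13.0], k=2)
print(threshold)  # 87.0
print(sql)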
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3. This works by: 1. Scanning part and filtering on brand and container 2. It then scans lineitem 3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in step 1) 4. It then computes the average of l_quantity from the joined table and groups the results by partkey 5. It then joins these computed averages with the joined table in step 3 6. It then filters out any rows where l_quantity is less than the computed average TODO: There are few ways this can be done, the above is just one. :return: None """ print('') print("TPCH Q17 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map( lambda p: query_plan.add_operator( tpch_q17.sql_scan_part_select_all_where_brand_and_container_op( part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q17.sql_scan_lineitem_select_all_where_partkey_op( lineitem_parts != 1, p, lineitem_parts, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_brand_container_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q17. 
project_lineitem_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) # part_lineitem_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)), # range(0, lineitem_parts)) part_lineitem_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_lineitem_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_part_avg_group = map( lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_5_op( 'lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_part_avg_group_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, part_parts)) # part_lineitem_join_avg_group_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_l_partkey_p_partkey_op( # 'part_lineitem_join_avg_group_join', query_plan)), # range(0, lineitem_parts)) part_lineitem_join_avg_group_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild( 'l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_avg_group_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression('l_partkey', 'p_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op( 'lineitem_filter' + '_' + str(p), query_plan)), range(0, part_parts)) extendedprice_sum_aggregate = map( lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q17.collate_op('collate', query_plan)) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter)) # lineitem_scan.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project)) # part_lineitem_join.connect_left_producer(part_project) # part_lineitem_join.connect_right_producer(lineitem_project) # part_lineitem_join.connect(lineitem_part_avg_group) map( lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) map( lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(lineitem_map)) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), 
enumerate(part_lineitem_join_probe)) # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), # enumerate(part_lineitem_join_avg_group_join_build)), # enumerate(part_lineitem_join_avg_group_join_probe)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts]. connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 4632.1085714286
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, format_, shards_path): secure = False use_native = False print('') print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format( sort_field, sort_order, k)) print("----------------------") stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p ), "select * from S3Object;", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK sort_expr = [SortExpression(sort_field, float, sort_order)] topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() query_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ 0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def run(parallel, use_pandas, buffer_size, selectivity1not0, typed, path, format_): secure = False use_native = False print('') print("CSV/Parquet Filter Return 1 Column Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) if selectivity1not0: if typed: # SQL scan the file scan = map(lambda p: query_plan.add_operator( SQLTableScan(path, "select F0 from S3Object " "where F0 >= 0;", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, 1)) else: # SQL scan the file scan = map(lambda p: query_plan.add_operator( SQLTableScan(path, "select F0 from S3Object " "where cast(F0 as int) >= 0;", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, 1)) else: # selectivity = 0 if typed: scan = map(lambda p: query_plan.add_operator( SQLTableScan(path, "select F0 from S3Object " "where F0 < 0;", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, 1)) else: scan = map(lambda p: query_plan.add_operator( SQLTableScan(path, "select F0 from S3Object " "where cast(F0 as int) < 0;", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, 1)) # project def fn(df): df.columns = ["F0"] return df project = map(lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(0, 1)) # aggregation def agg_fun(df): return pd.DataFrame({'count': [len(df)]}) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(1)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
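# --- Illustrative sketch (not from the benchmark code above) ---------------
# The two query variants differ only in whether the predicate can be evaluated
# on an already-typed column ('F0 >= 0', the Parquet case) or must cast first
# ('cast(F0 as int) >= 0', the CSV case). The cast matters because CSV fields
# reach S3 Select as strings, and string comparison is lexicographic:
typed_values = [10, 2, -3]
csv_values = ['10', '2', '-3']

print(sum(1 for v in typed_values if v >= 0))       # 2 qualifying rows
print(sum(1 for v in csv_values if int(v) >= 0))    # same 2 rows once cast
print('9' > '10')  # True lexicographically, which is why the CSV query casts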
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, nthreads=16, format_=Format.CSV): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan Index Files upper = perc * 100 index_scan = map(lambda p: query_plan.add_operator( SQLTableScan('{}/index/index_f0_{}.csv'.format(path, p), "select first_byte, last_byte " " from S3Object " " where name < {};".format(upper), format_, use_pandas, secure, use_native, 'index_scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Range accesses range_access = map(lambda p: query_plan.add_operator( TableRangeAccess('{}/data_{}.csv'.format(path, p), use_pandas, secure, use_native, 'range_access_{}'.format(p), query_plan, False)), range(0, table_parts)) map(lambda o: o.set_nthreads(nthreads), range_access) collate = query_plan.add_operator( Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan)) map(lambda (p, o): o.connect(collate), enumerate(range_access)) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded): """ :return: None """ print('') print("TPCH Q14 Semi Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) part_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_partkey_where_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_1' + '_' + str(p), query_plan)), range(0, part_parts)) part_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_p_partkey_operator_def( 'part_1_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def( 'part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_l_partkey_operator_def( 'lineitem_1_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_1_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_1_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_1_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) join_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_l_partkey_operator_def( 'join_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) part_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_2' + '_' + str(p), query_plan)), range(0, part_parts)) part_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_type_operator_def( 'part_2_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_2_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_2_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_2_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_2_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( 
tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(part_1_project[p]), enumerate(part_scan_1)) map(lambda (p, o): o.connect(part_bloom_create[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(part_bloom_create)), enumerate(lineitem_scan_1)) map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_1_build)), enumerate(part_lineitem_join_1_probe)) map(lambda (p, o): o.connect(lineitem_1_project[p]), enumerate(lineitem_scan_1)) map( lambda (p, o): part_lineitem_join_1_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_1_project)) map(lambda (p, o): o.connect(join_bloom_create[p]), enumerate(part_lineitem_join_1_probe)) map( lambda (p, o): map(lambda (bp, bo): o.connect(bo), enumerate(part_scan_2)), enumerate(join_bloom_create)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(join_bloom_create)), enumerate(lineitem_scan_2)) map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2)) map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]), enumerate(part_2_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_2_build)), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(lineitem_2_project[p]), enumerate(lineitem_scan_2)) map( lambda (p, o): part_lineitem_join_2_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_2_project)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
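# --- Illustrative sketch (not from the benchmark code above) ---------------
# Both Q14 variants end in the same pair of SUMs followed by a projection of
# the promo-revenue ratio. Assuming the standard TPC-H Q14 definition, the
# final figure is 100 * sum(revenue of PROMO parts) / sum(all revenue) over the
# joined, ship-date-filtered rows. A pandas sketch of that last step on toy rows:
import pandas as pd


def promo_revenue(joined):
    revenue = joined['l_extendedprice'] * (1 - joined['l_discount'])
    promo = revenue.where(joined['p_type'].str.startswith('PROMO'), 0.0)
    return 100.0 * promo.sum() / revenue.sum()


joined = pd.DataFrame({
    'l_extendedprice': [100.0, 200.0],
    'l_discount': [0.0, 0.5],
    'p_type': ['PROMO BRUSHED TIN', 'STANDARD BRASS'],
})
print(promo_revenue(joined))  # 100 * 100 / (100 + 100) = 50.0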
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts, queried_columns, queried_aliases, castable_aliases, select_str, aggregate_column, filter_str, path, format_=Format.PARQUET): secure = False use_native = False print('') print("Parquet Aggregate Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # SQL scan the file scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}/lineitem.{}.parquet".format(path, p ), "select {} from S3Object " "{};".format(select_str, filter_str), format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # project def fn(df): df.columns = queried_aliases df[castable_aliases] = df[castable_aliases].astype(np.double) return df project = map( lambda p: query_plan.add_operator( Project([], 'project_{}'.format(p), query_plan, False, fn)), range(table_first_part, table_first_part + table_parts)) # aggregation def agg_fun(df): if aggregate_column in df: return pd.DataFrame({'sum': [sum(df[aggregate_column])]}) else: return pd.DataFrame({'count': len(df)}, index=[0]) aggregate = query_plan.add_operator( Aggregate([], True, 'agg', query_plan, False, agg_fun)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(aggregate), enumerate(project)) aggregate.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
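# --- Illustrative sketch (not from the benchmark code above) ---------------
# Much of the benefit measured here comes from Parquet being columnar: a
# 'select {cols}' scan only has to touch the requested column chunks. The same
# idea locally with pandas/pyarrow (hypothetical file name; the benchmark
# itself goes through SQLTableScan and S3 Select):
import pandas as pd


def scan_columns(path, columns):
    # Reads only the named columns from a Parquet file.
    return pd.read_parquet(path, columns=columns)


# Hypothetical usage against a local copy of one shard:
# df = scan_columns('lineitem.0.parquet', ['l_extendedprice', 'l_quantity'])
# print(df['l_extendedprice'].sum())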
def topk_with_sampling(stats, k, k_scale=1, sort_index='_5', col_type=float, sort_field='l_extendedprice', sort_order='DESC', use_pandas=True, filtered=False, conservative=False, table_name='tpch-sf1/lineitem.csv', shards_prefix='tpch-sf1/lineitem_sharded', shards_start=0, shards_end=31): """ Executes the optimized topk query by firstly retrieving the first k tuples. Based on the retrieved tuples, table scan operator gets only the tuples larger/less than the most significant tuple in the sample :return: """ limit = k num_rows = 0 parallel_shards = True processes = multiprocessing.cpu_count() query_stats = [ 'sampling_{}_{}'.format( 'conservative' if conservative else 'aggressive', 'filtered' if filtered else 'non-filtered'), shards_prefix, sort_field, sort_order, limit * k_scale, limit * k_scale ] sql = 'select * from S3Object;' if filtered: sql = '''select l_extendedprice from S3Object;''' sort_index = '_0' print("\n\nSampling params:") print("Scale: {}, Sort Field: {}, Sort Order: {}\n".format( k_scale, sort_field, sort_order)) query_plan = QueryPlan(is_async=True) # Query plan ts = query_plan.add_operator( TopKTableScan( table_name, sql, use_pandas, True, False, limit, k_scale, SortExpression(sort_index, col_type, sort_order, sort_field), conservative, shards_start, shards_end, parallel_shards, shards_prefix, processes, 'topk_table_scan', query_plan, False)) c = query_plan.add_operator(Collate('collate', query_plan, False)) ts.connect(c) # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results for t in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) # assert num_rows == limit + 1 cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] query_stats += [ ts.msv, ts.op_metrics.sampling_time, query_plan.total_elapsed_time, ts.op_metrics.sampling_time + query_plan.total_elapsed_time, # 0 if num_rows >= limit else 1, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost #, # num_rows == limit + 1 ] stats.append(query_stats) # Write the metrics query_plan.print_metrics() query_plan.stop() topk_op = query_plan.operators["topk_table_scan"] max_table_scan, _ = max([(op, op.op_metrics.elapsed_time()) for op in query_plan.operators.values()], key=lambda tup: tup[1]) OpMetrics.print_overall_metrics( [max_table_scan, topk_op, topk_op.sample_op], "TopKTableScan total time")