drill = "drill" # None compareResults = True if "compare_results" in Settings.data["RunSettings"]: compareResults = Settings.data["RunSettings"]["compare_results"] if ((Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR): # Create Table Drill ------------------------------------------------ print("starting drill") from pydrill.client import PyDrill drill = PyDrill(host="localhost", port=8047) cs.init_drill_schema(drill, Settings.data["TestSettings"]["dataDirectory"]) # Create Context For BlazingSQL bc, dask_client = init_context() nRals = Settings.data["RunSettings"]["nRals"] main(dask_client, drill, Settings.data["TestSettings"]["dataDirectory"], bc, nRals) if Settings.execution_mode != ExecutionMode.GENERATOR: runTest.save_log() gpuMemory.print_log_gpu_memory()
Execution.getArgs() nvmlInit() drill = "drill" #None spark = "spark" compareResults = True if 'compare_results' in Settings.data['RunSettings']: compareResults = Settings.data['RunSettings']['compare_results'] if (Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR: # Create Table Drill ------------------------------------------------------------------------------------------------------ from pydrill.client import PyDrill drill = PyDrill(host = 'localhost', port = 8047) cs.init_drill_schema(drill, Settings.data['TestSettings']['dataDirectory']) # Create Table Spark ------------------------------------------------------------------------------------------------------ spark = SparkSession.builder.appName("timestampTest").getOrCreate() cs.init_spark_schema(spark, Settings.data['TestSettings']['dataDirectory']) #Create Context For BlazingSQL bc, dask_client = init_context() nRals = Settings.data['RunSettings']['nRals'] main(dask_client, drill, spark, Settings.data['TestSettings']['dataDirectory'], bc, nRals) if Settings.execution_mode != ExecutionMode.GENERATOR: runTest.save_log()
drill = "drill" # None compareResults = True if "compare_results" in Settings.data["RunSettings"]: compareResults = Settings.data["RunSettings"]["compare_results"] if ((Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR): # Create Table Drill ------------------------------------------------ print("starting drill") from pydrill.client import PyDrill drill = PyDrill(host="localhost", port=8047) cs.init_drill_schema(drill, Settings.data["TestSettings"]["dataDirectory"], bool_test=True) # Create Context For BlazingSQL bc, dask_client = init_context() nRals = Settings.data["RunSettings"]["nRals"] main(dask_client, drill, Settings.data["TestSettings"]["dataDirectory"], bc, nRals) if Settings.execution_mode != ExecutionMode.GENERATOR: runTest.save_log() gpuMemory.print_log_gpu_memory()
def main(): print("**init end2end**") Execution.getArgs() nvmlInit() dir_data_file = Settings.data["TestSettings"]["dataDirectory"] nRals = Settings.data["RunSettings"]["nRals"] drill = "drill" spark = "spark" compareResults = True if "compare_results" in Settings.data["RunSettings"]: compareResults = Settings.data["RunSettings"]["compare_results"] if (Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR: # Create Table Drill ----------------------------------------- from pydrill.client import PyDrill drill = PyDrill(host="localhost", port=8047) createSchema.init_drill_schema( drill, Settings.data["TestSettings"]["dataDirectory"], bool_test=True) # Create Table Spark ------------------------------------------------- from pyspark.sql import SparkSession spark = SparkSession.builder.appName("allE2ETest").getOrCreate() createSchema.init_spark_schema( spark, Settings.data["TestSettings"]["dataDirectory"]) # Create Context For BlazingSQL bc, dask_client = init_context() targetTestGroups = Settings.data["RunSettings"]["targetTestGroups"] runAllTests = ( len(targetTestGroups) == 0 ) # if targetTestGroups was empty the user wants to run all the tests if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups): aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("coalesceTest" in targetTestGroups): coalesceTest.main(dask_client, drill, dir_data_file, bc, nRals) # we are not supporting coalesce yet if runAllTests or ("columnBasisTest" in targetTestGroups): columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("commonTableExpressionsTest" in targetTestGroups): commonTableExpressionsTest.main(dask_client, drill, dir_data_file, bc, nRals) # we are not supporting count distinct yet # countDistincTest.main(dask_client, drill, dir_data_file, bc) if runAllTests or ("countWithoutGroupByTest" in targetTestGroups): countWithoutGroupByTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("dateTest" in targetTestGroups): dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("timestampTest" in targetTestGroups): timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("fullOuterJoinsTest" in targetTestGroups): fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("groupByTest" in targetTestGroups): groupByTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups): GroupByWitoutAggregations.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("innerJoinsTest" in targetTestGroups): innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("crossJoinsTest" in targetTestGroups): crossJoinsTest.main(dask_client, spark, dir_data_file, bc, nRals) if runAllTests or ("" in targetTestGroups): leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("nonEquiJoinsTest" in targetTestGroups): nonEquiJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals) # loadDataTest.main(dask_client, bc) #check this if runAllTests or ("nestedQueriesTest" in targetTestGroups): nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("orderbyTest" in targetTestGroups): orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("predicatesWithNulls" in targetTestGroups): predicatesWithNulls.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("stringTests" in targetTestGroups): stringTests.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("tablesFromPandasTest" in targetTestGroups): tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("unaryOpsTest" in targetTestGroups): unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("unifyTablesTest" in targetTestGroups): unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("unionTest" in targetTestGroups): unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("useLimitTest" in targetTestGroups): useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("whereClauseTest" in targetTestGroups): whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("bindableAliasTest" in targetTestGroups): bindableAliasTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("booleanTest" in targetTestGroups): booleanTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("caseTest" in targetTestGroups): caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("castTest" in targetTestGroups): castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("concatTest" in targetTestGroups): concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("literalTest" in targetTestGroups): literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("dirTest" in targetTestGroups): dirTest.main(dask_client, drill, dir_data_file, bc, nRals) # HDFS is not working yet # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc) # HDFS is not working yet # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc) if runAllTests or ("likeTest" in targetTestGroups): likeTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("simpleDistributionTest" in targetTestGroups): simpleDistributionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("substringTest" in targetTestGroups): substringTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("wildCardTest" in targetTestGroups): wildCardTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("tpchQueriesTest" in targetTestGroups): tpchQueriesTest.main(dask_client, drill, spark, dir_data_file, bc, nRals) if runAllTests or ("roundTest" in targetTestGroups): roundTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("fileSystemLocalTest" in targetTestGroups): fileSystemLocalTest.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("messageValidationTest" in targetTestGroups): messageValidationTest.main(dask_client, drill, dir_data_file, bc, nRals) if Settings.execution_mode != ExecutionMode.GPUCI: if runAllTests or ("fileSystemS3Test" in targetTestGroups): fileSystemS3Test.main(dask_client, drill, dir_data_file, bc, nRals) if runAllTests or ("fileSystemGSTest" in targetTestGroups): fileSystemGSTest.main(dask_client, drill, dir_data_file, bc, nRals) # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals) if Settings.execution_mode != ExecutionMode.GENERATOR: result, error_msgs = runTest.save_log( Settings.execution_mode == ExecutionMode.GPUCI) max = 0 for i in range(0, len(Settings.memory_list)): if (Settings.memory_list[i].delta) > max: max = Settings.memory_list[i].delta print("MAX DELTA: " + str(max)) print("""*********************************************************** ********************""") for i in range(0, len(Settings.memory_list)): print(Settings.memory_list[i].name + ":" + " Start Mem: " + str(Settings.memory_list[i].start_mem) + " End Mem: " + str(Settings.memory_list[i].end_mem) + " Diff: " + str(Settings.memory_list[i].delta)) return result, error_msgs return True, []
def main(dask_client, bc): # Create Table Drill ------------------------------------------------------------------------------------------------------ drill = PyDrill(host='localhost', port=8047) dir_data_lc = Settings.data['TestSettings']['dataDirectory'] for x in range(5): # [numberOfFiles, type_nation, type_region, type_supplier, type_customer, type_lineitem, type_orders] run = [] if x == 0: run = [1, 'psv', 'psv', 'psv', 'psv', 'psv', 'psv'] elif x == 1: run = [ 2, 'parquet', 'parquet', 'parquet', 'parquet', 'parquet', 'parquet' ] elif x == 2: run = [6, 'parquet', 'psv', 'parquet', 'psv', 'parquet', 'psv'] elif x == 3: run = [10, 'psv', 'parquet', 'psv', 'parquet', 'psv', 'parquet'] elif x == 4: run = [12, 'psv', 'psv', 'parquet', 'parquet', 'psv', 'parquet'] print( "=======================================================================" ) print("Running " + str(x + 1) + ":") print("Número de Archivos: " + str(run[0])) print("Type of files for Nation: " + run[1]) print("Type of files for Region: " + run[2]) print("Type of files for Supplier: " + run[3]) print("Type of files for Customer: " + run[4]) print("Type of files for Lineitem: " + run[5]) print("Type of files for Orders: " + run[6]) print( "=======================================================================" ) print("1") num_files = run[0] print("2") cs.init_drill_schema(drill, dir_data_lc, n_files=num_files) print("3") #Read Data TPCH------------------------------------------------------------------------------------------------------------ nation_files = cs.get_filenames_table('nation', dir_data_lc, num_files, run[1]) bc.create_table('nation', nation_files, delimiter='|', dtype=cs.get_dtypes('nation'), names=cs.get_column_names('nation')) region_files = cs.get_filenames_table('region', dir_data_lc, num_files, run[2]) bc.create_table('region', region_files, delimiter='|', dtype=cs.get_dtypes('region'), names=cs.get_column_names('region')) supplier_files = cs.get_filenames_table('supplier', dir_data_lc, num_files, run[3]) bc.create_table('supplier', supplier_files, delimiter='|', dtype=cs.get_dtypes('supplier'), names=cs.get_column_names('supplier')) customer_files = cs.get_filenames_table('customer', dir_data_lc, num_files, run[4]) bc.create_table('customer', customer_files, delimiter='|', dtype=cs.get_dtypes('customer'), names=cs.get_column_names('customer')) lineitem_files = cs.get_filenames_table('lineitem', dir_data_lc, num_files, run[5]) bc.create_table('lineitem', lineitem_files, delimiter='|', dtype=cs.get_dtypes('lineitem'), names=cs.get_column_names('lineitem')) orders_files = cs.get_filenames_table('orders', dir_data_lc, num_files, run[6]) bc.create_table('orders', orders_files, delimiter='|', dtype=cs.get_dtypes('orders'), names=cs.get_column_names('orders')) #Run Query ----------------------------------------------------------------------------- worder = 1 #Parameter to indicate if its necessary to order the resulsets before compare them use_percentage = False acceptable_difference = 0.01 queryType = 'Load Data Test' print('==============================') print(queryType) print('==============================') queryId = 'TEST_01' query = "select count(c_custkey) as c1, count(c_acctbal) as c2 from customer" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_02' query = "select count(n_nationkey), count(n_regionkey) from nation" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_03' query = "select count(s_suppkey), count(s_nationkey) from supplier" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_04' query = "select count(c_custkey), sum(c_acctbal), sum(c_acctbal)/count(c_acctbal), min(c_custkey), max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2 c_nationkey from customer where c_custkey < 100 group by c_nationkey" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, True) #TODO: Change sum/count for avg KC queryId = 'TEST_05' query = "select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_06' query = "select c_custkey, c_nationkey, c_acctbal from customer order by c_nationkey, c_acctbal" runTest.run_query(bc, drill, query, queryId, queryType, 0, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_07' query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1, 2" runTest.run_query(bc, drill, query, queryId, queryType, 0, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_08' query = "select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_09' query = "select c_custkey, c_nationkey as nkey from customer where c_custkey < 0 and c_nationkey >=30" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_10' query = "select sin(c_acctbal), cos(c_acctbal), asin(c_acctbal), acos(c_acctbal), ln(c_acctbal), tan(c_acctbal), atan(c_acctbal), floor(c_acctbal), c_acctbal from customer" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_11' query = "select n1.n_nationkey as n1key, n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 where n1.n_nationkey < 10 and n1.n_nationkey > 5" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_12' query = "select count(n1.n_nationkey) as n1key, count(n2.n_nationkey) as n2key, count(*) as cstar from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_13' query = "select o_orderkey, o_custkey from orders where o_orderkey < 10 and o_orderkey >= 1" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_14' query = "select 100168549 - sum(o_orderkey)/count(o_orderkey), 56410984/sum(o_totalprice), (123 - 945/max(o_orderkey))/(sum(81619/o_orderkey)/count(81619/o_orderkey)) from orders" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, True) #TODO: Change sum/count for avg KC queryId = 'TEST_15' query = "select o_orderkey, sum(o_totalprice)/count(o_orderstatus) from orders where o_custkey < 100 group by o_orderstatus, o_orderkey" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_16' query = "select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus <> 'O' order by o_orderkey, o_orderstatus limit 50" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_17' query = "select count(o_orderstatus) from orders where o_orderstatus <> 'O'" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_18' query = "select count(o_orderkey), sum(o_orderkey), o_clerk from orders where o_custkey < 1000 group by o_clerk, o_orderstatus" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_19' query = "select sum(o_orderkey)/count(o_orderkey) from orders group by o_orderstatus" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, True) #TODO: Change sum/count for avg KC queryId = 'TEST_20' query = "select count(o_shippriority), sum(o_totalprice) from orders group by o_shippriority" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_21' query = """with regionTemp as ( select r_regionkey, r_name from region where r_regionkey > 2 ), nationTemp as(select n_nationkey, n_regionkey as fkey, n_name from nation where n_nationkey > 3 order by n_nationkey) select regionTemp.r_name, nationTemp.n_name from regionTemp inner join nationTemp on regionTemp.r_regionkey = nationTemp.fkey""" runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_22' query = """select o.o_totalprice, l.l_partkey from orders as o left outer join lineitem as l on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey where l.l_linenumber < 1000""" runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_23' query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey where c.c_custkey < 1000""" runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) queryId = 'TEST_24' query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus from orders as o full outer join lineitem as l on l.l_orderkey = o.o_orderkey where o.o_orderkey < 1000""" runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType) runTest.save_log()