def cli(dbname, host, port, timeout, interactive, version):
    """CLI entry point."""
    if version or dbname == 'version':
        print('datacli version: {}'.format(__version__))
        sys.exit(0)
    configure_logger()
    conn = PyDrill(host=host, port=port)
    if not conn.is_active():
        log.error('unable to reach Drill server')
        return 1
    cli = DataCli(conn, dbname, DataPrompt(), timeout=timeout)
    log.info('connected to Drillbit')
    while True:
        try:
            should_exit = cli.repl(interactive)
            if should_exit:
                break
        except KeyboardInterrupt:
            break  # Control-C pressed
        except EOFError:
            break  # Control-D pressed
    log.info('shutting down...')
    return 0
def connection(self):
    from pydrill.client import PyDrill
    if self._connection is None:
        self._connection = PyDrill(**self.params)
    return self._connection
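# A minimal sketch (class name and params are hypothetical) of the lazy
# connection pattern the property above implements: the PyDrill client is
# created on first access and cached for subsequent calls.
from pydrill.client import PyDrill


class DrillSource:
    def __init__(self, **params):
        self.params = params        # e.g. host='localhost', port=8047
        self._connection = None     # created lazily on first use

    @property
    def connection(self):
        if self._connection is None:
            self._connection = PyDrill(**self.params)
        return self._connection


# Usage: no connection is opened until `connection` is first read.
# source = DrillSource(host='localhost', port=8047)
# rows = source.connection.query('SELECT 1')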
def test_authentication_success(pydrill_url):
    responses.add(**{
        'method': responses.POST,
        'url': "{0}/{1}".format(pydrill_url, 'j_security_check'),
    })
    PyDrill(auth='user:password')
def main():
    print("**init end2end**")
    Execution.getArgs()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]

    # Create Table Drill -----------------------------------------
    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    # All tests only pass with 100 MB of data
    csvFromLocalTest.main(drill, dir_data_file)
    csvFromS3Test.main(drill, dir_data_file)
    # AttributeError: 'NoneType' object has no attribute '_cols'
    # vector::_M_range_check: __n
    # (which is 18446744073709551615) >= this->size() (which is 2)
    csvFromHdfsTest.main(drill, dir_data_file)
    # All tests only pass with 100 MB of data
    parquetFromLocalTest.main(drill, dir_data_file)
    # All tests pass with 100 MB; with multiple files per table they do not,
    # because not all the files are loaded correctly.
    parquetFromS3Test.main(drill, dir_data_file)
    # Hangs while reading the data
    parquetFromHdfsTest.main(drill, dir_data_file)

    runTest.save_log()
    for i in range(0, len(Settings.memory_list)):
        print(Settings.memory_list[i].name + ":" +
              " Start Mem: " + str(Settings.memory_list[i].start_mem) +
              " End Mem: " + str(Settings.memory_list[i].end_mem) +
              " Diff: " + str(Settings.memory_list[i].delta))
def init_drill():
    # Start Drill schema -----------------------------------------
    from pydrill.client import PyDrill
    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(
        drill, Settings.data["TestSettings"]["dataDirectory"],
        bool_test=True)
    createSchema.init_drill_schema(
        drill, Settings.data["TestSettings"]["dataDirectory"],
        smiles_test=True, fileSchemaType=DataType.PARQUET)
    return drill
def main():
    print('**init performance test**')
    Execution.getArgs()
    dir_data_file = Settings.data['TestSettings']['dataDirectory']

    # Create Table Drill -------------------------------------------------
    drill = PyDrill(host='localhost', port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    jobId = 1
    if Settings.data['MysqlConnection']['connectEnabled']:
        from DataBase import mysqlDatabaseManager as msqldb
        jobId = msqldb.getJobId()

    for x in range(0, 10):
        performanceTest.main(drill, dir_data_file)

    runTest.save_log(job_id=jobId)
def main():
    print("**init end2end**")
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]
    nRals = Settings.data["RunSettings"]["nRals"]

    drill = "drill"
    spark = "spark"
    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL and
         compareResults == "true") or
            Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill -----------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        createSchema.init_drill_schema(
            drill, Settings.data["TestSettings"]["dataDirectory"],
            bool_test=True)
        createSchema.init_drill_schema(
            drill, Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True, fileSchemaType=DataType.PARQUET)

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data["TestSettings"]["dataDirectory"])
        createSchema.init_spark_schema(
            spark, Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True, fileSchemaType=DataType.PARQUET)

    targetTestGroups = Settings.data["RunSettings"]["targetTestGroups"]

    # only innerJoinsTest will run with a progress bar
    useProgressBar = False
    if "innerJoinsTest" in targetTestGroups:
        useProgressBar = True
    print("Using progress bar: ", useProgressBar)

    # Create Context For BlazingSQL
    bc, dask_client = init_context(useProgressBar=useProgressBar)

    # if targetTestGroups was empty the user wants to run all the tests
    runAllTests = len(targetTestGroups) == 0

    if runAllTests or ("hiveFileTest" in targetTestGroups):
        hiveFileTest.main(dask_client, spark, dir_data_file, bc, nRals)
    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)
    if runAllTests or ("coalesceTest" in targetTestGroups):
        coalesceTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file,
                                        bc, nRals)
    if runAllTests or ("countDistinctTest" in targetTestGroups):
        countDistinctTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                     bc, nRals)
    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("toTimestampTest" in targetTestGroups):
        toTimestampTest.main(dask_client, spark, dir_data_file, bc, nRals)
    if runAllTests or ("dayOfWeekTest" in targetTestGroups):
        dayOfWeekTest.main(dask_client, spark, dir_data_file, bc, nRals)
    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file,
                                       bc, nRals)
    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("crossJoinsTest" in targetTestGroups):
        crossJoinsTest.main(dask_client, spark, dir_data_file, bc, nRals)
    if runAllTests or ("leftOuterJoinsTest" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, spark, dir_data_file,
                              bc, nRals)
    # loadDataTest.main(dask_client, bc)  # check this
    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, spark, dir_data_file,
                                 bc, nRals)
    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file,
                               bc, nRals)
    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    # HDFS is not working yet
    # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc)
    # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc)
    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("stringCaseTest" in targetTestGroups):
        stringCaseTest.main(dask_client, drill, spark, dir_data_file,
                            bc, nRals)
    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file,
                             bc, nRals)
    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, spark, dir_data_file,
                                 bc, nRals)
    if runAllTests or ("messageValidationTest" in targetTestGroups):
        messageValidationTest.main(dask_client, drill, dir_data_file,
                                   bc, nRals)

    testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]
    if testsWithNulls != "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("fileSystemS3Test" in targetTestGroups):
                fileSystemS3Test.main(dask_client, drill, dir_data_file,
                                      bc, nRals)
            if runAllTests or ("fileSystemGSTest" in targetTestGroups):
                fileSystemGSTest.main(dask_client, drill, dir_data_file,
                                      bc, nRals)

    if runAllTests or ("loggingTest" in targetTestGroups):
        loggingTest.main(dask_client, dir_data_file, bc, nRals)
    # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    # TODO: re-enable this test once we have the new version of dask
    # https://github.com/dask/distributed/issues/4645
    # https://github.com/rapidsai/cudf/issues/7773
    # if runAllTests or ("smilesTest" in targetTestGroups):
    #     smilesTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("jsonTest" in targetTestGroups):
            jsonTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("windowFunctionTest" in targetTestGroups):
        windowFunctionTest.main(dask_client, drill, spark, dir_data_file,
                                bc, nRals)
    if runAllTests or ("windowNoPartitionTest" in targetTestGroups):
        windowNoPartitionTest.main(dask_client, drill, spark, dir_data_file,
                                   bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("concurrentTest" in targetTestGroups):
            concurrentTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if testsWithNulls == "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("tablesFromSQL" in targetTestGroups):
                tablesFromSQL.main(dask_client, drill, dir_data_file,
                                   bc, nRals)

    # WARNING!!! This test must be the last one to run --------------------
    if runAllTests or ("configOptionsTest" in targetTestGroups):
        configOptionsTest.main(dask_client, drill, spark, dir_data_file,
                               bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:
        result, error_msgs = runTest.save_log(
            Settings.execution_mode == ExecutionMode.GPUCI)
        max_delta = 0
        for i in range(0, len(Settings.memory_list)):
            if Settings.memory_list[i].delta > max_delta:
                max_delta = Settings.memory_list[i].delta
        print("MAX DELTA: " + str(max_delta))
        print("*" * 79)
        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" +
                  " Start Mem: " + str(Settings.memory_list[i].start_mem) +
                  " End Mem: " + str(Settings.memory_list[i].end_mem) +
                  " Diff: " + str(Settings.memory_list[i].delta))
        return result, error_msgs

    return True, []
def get_column_types(query):
    drill = PyDrill(host='localhost', port=8049)
    data = drill.query(query)
    columns = data.columns
    types = {}

    formattedQuery = sqlparse.format(query, reindent=True,
                                     keyword_case='upper')
    formattedQuery = formattedQuery.split('\n')
    print(sqlparse.format(query, reindent=True, keyword_case='upper'))

    inSelect = False
    inSubquery = False
    inFromClause = False
    fields = []
    fieldRegex = r'\s{7}\S'
    fieldSubquery = r'\s'
    subqueryFieldPattern = r'\s{2,3}\S'
    subqueryField = ""
    fromClause = ""
    functionPattern = r'\s+(\S+)\('
    fieldCount = 0

    for line in formattedQuery:
        functionMatchObject = re.match(functionPattern, line)
        if line.startswith('SELECT'):
            inSelect = True
            line = line.replace('SELECT', '')
            line = line.strip()
            # remove trailing comma
            if len(line) > 0:
                if line[-1:] == ",":
                    line = line[:-1]
                fields.append(line)
        # If the line is a function, assign the correct return type
        elif inSelect and not inFromClause and functionMatchObject:
            print("FieldCount: " + str(fieldCount) + " " + line)
            functionCandidate = functionMatchObject.group(1)
            functionCandidate = functionCandidate.upper()
            if functionCandidate in _BIG_INT_FUNCTIONS:
                types[columns[fieldCount]] = "bigint"
            elif functionCandidate in _INT_FUNCTIONS:
                types[columns[fieldCount]] = "integer"
            elif functionCandidate in _FLOAT_FUNCTIONS:
                types[columns[fieldCount]] = "float"
            else:
                types[columns[fieldCount]] = "varchar"
            fieldCount += 1
            continue  # skip the per-line increment at the bottom of the loop
        # Case for a regular field
        elif inSelect and re.match(fieldRegex, line):
            line = line.strip()
            # remove trailing comma from field name
            if len(line) > 0:
                if line[-1:] == ",":
                    line = line[:-1]
                fields.append(line)
        elif inSelect and line.startswith('FROM'):
            inSelect = False
            inFromClause = True
            if inSubquery:
                fields.append(subqueryField)
                inSubquery = False
            else:
                fromClause = fromClause + " " + line.strip()
        elif inFromClause and (line.startswith('WHERE')
                               or line.startswith('GROUP')
                               or line.startswith('ORDER')
                               or line.startswith('HAVING')):
            inFromClause = False
            inSelect = False
        elif (re.match(subqueryFieldPattern, line) and not inSubquery
              and not inFromClause):
            inSubquery = True
            subqueryField = line.strip()
        elif inSubquery:
            subqueryField = subqueryField + " " + line.strip()
            if line.endswith(','):
                inSubquery = False
                fields.append(subqueryField)
                subqueryField = ""
        elif inSubquery and not line:
            # NOTE: unreachable as written; the branch above already matches
            # whenever inSubquery is True
            inSubquery = False
            fields.append(subqueryField)
            subqueryField = ""
        elif inFromClause:
            fromClause = fromClause + " " + line.strip()
        fieldCount += 1

    typeQuery = "SELECT"
    fieldCount = 0
    aliasPattern = r'AS\s`?[a-zA-Z_][a-zA-Z0-9-_$` ]*$'
    for field in fields:
        if re.search(aliasPattern, field):
            field = re.sub(aliasPattern, '', field)
        if fieldCount > 0:
            typeQuery += ","
        typeQuery = (typeQuery + " " + field + " AS " + columns[fieldCount]
                     + ", typeof( " + field + ") AS "
                     + columns[fieldCount] + "_type")
        fieldCount += 1

    typeQuery += fromClause
    typeQuery += " LIMIT 1"
    typeQuery = sqlparse.format(typeQuery, reindent=True,
                                keyword_case='upper')
    print(typeQuery)

    fieldQueryResult = drill.query(typeQuery).to_dataframe()
    tempTypes = fieldQueryResult.T.to_dict()[0]
    for column in columns:
        if column not in types.keys():
            types[column] = tempTypes[column + "_type"]
    print(types)
    return types
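# Hedged usage sketch for get_column_types above; the query is illustrative
# (cp.`employee.json` ships with Drill) and the exact type strings depend on
# what Drill's typeof() reports.
if __name__ == '__main__':
    types = get_column_types(
        "SELECT employee_id, full_name FROM cp.`employee.json`")
    print(types)  # e.g. {'employee_id': 'BIGINT', 'full_name': 'VARCHAR'}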
def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):
    """
    Entry point for query execution on csv files.
    :param query: string query
    :return: list of results (empty on error); rows are pushed to `queue`

    NB: the default Queue() is evaluated once at definition time, so all
    calls that omit `queue` share the same instance.
    """
    from time import time

    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    query_filters = [f for f in self.query.body.triples[0].triples
                     if isinstance(f, Filter)]

    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    sqlquery, projvartocols, coltotemplates, filenametablename = \
        self.translate(query_filters)
    if sqlquery is None or len(sqlquery) == 0:
        queue.put("EOF")
        return []

    try:
        start = time()
        try:
            self.drill = PyDrill(host=self.host, port=self.port)
        except Exception as ex:
            print("Exception while connecting to Drill", ex)
            queue.put("EOF")
            return
        if not self.drill.is_active():
            print('Exception: Please run Drill first')
            queue.put("EOF")
            return
        logger.info("Drill Initialization cost:" + str(time() - start))

        start = time()
        if isinstance(sqlquery, list):
            sqlquery = [sql for sql in sqlquery
                        if sql is not None and len(sql) > 0]
            if len(sqlquery) > 3:
                sqlquery = " UNION ".join(sqlquery)

        if isinstance(sqlquery, list):
            # run each UNION branch in its own process
            sqlquery = [sql for sql in sqlquery
                        if sql is not None and len(sql) > 0]
            processqueues = []
            processes = []
            res_dict = []
            for sql in sqlquery:
                processquery = Queue()
                processqueues.append(processquery)
                p = Process(target=self.run_union,
                            args=(sql, queue, projvartocols, coltotemplates,
                                  limit, processquery, res_dict,))
                p.start()
                processes.append(p)
            while len(processqueues) > 0:
                toremove = []
                try:
                    for q in processqueues:
                        if q.get(False) == 'EOF':
                            toremove.append(q)
                            for p in processes:
                                if p.is_alive():
                                    p.terminate()
                except:
                    pass
                for q in toremove:
                    processqueues.remove(q)
            logger.info("Done running:")
            sw = " UNION ".join(sqlquery)
            logger.info(sw)
        else:
            # paginate a single SQL statement with LIMIT/OFFSET
            card = 0
            # if limit == -1:
            limit = 1000
            if offset == -1:
                offset = 0
            logger.info(sqlquery)
            while True:
                query_copy = (sqlquery + " LIMIT " + str(limit)
                              + " OFFSET " + str(offset))
                cardinality = self.process_result(query_copy, queue,
                                                  projvartocols,
                                                  coltotemplates)
                card += cardinality
                if cardinality < limit:
                    break
                offset = offset + limit
        logger.info("Exec in Drill took:" + str(time() - start))
    except Exception as e:
        print("Exception ", e)
    queue.put("EOF")
from IPython.core.magic import register_line_magic

# expose drill client
from pydrill.client import PyDrill
drill = PyDrill(host="drill", port=8047)

# adjust pandas settings
import pandas
pandas.set_option('display.max_colwidth', -1)
pandas.set_option('display.max_rows', None)

import datetime
from pandas.io.json import json_normalize
import json


# drill query wrapper
def drill_query(query):
    res = drill.query(query)
    df = res.to_dataframe().dropna()
    df['@timestamp'] = pandas.to_datetime(df['@timestamp'], utc=True)
    return df


# time based query - returns results from last x minutes
def drill_tquery(query_s, minutes):
    df = drill_query(query_s)
    return df[df['@timestamp'] >= datetime.datetime.now().replace(
        tzinfo=datetime.timezone.utc) - datetime.timedelta(minutes=minutes)]
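# Usage sketch for the wrappers above (the table path is hypothetical; any
# table with an `@timestamp` column works): fetch only the rows from the
# last 15 minutes.
if __name__ == '__main__':
    recent = drill_tquery(
        "SELECT * FROM dfs.`/logs/events.json`",  # hypothetical path
        minutes=15)
    print(recent.head())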
nvmlInit()

drill = "drill"  # None
spark = "spark"

compareResults = True
if "compare_results" in Settings.data["RunSettings"]:
    compareResults = Settings.data["RunSettings"]["compare_results"]

if ((Settings.execution_mode == ExecutionMode.FULL and
     compareResults == "true") or
        Settings.execution_mode == ExecutionMode.GENERATOR):
    # Create Table Drill ------------------------------------------------
    from pydrill.client import PyDrill
    drill = PyDrill(host="localhost", port=8047)
    cs.init_drill_schema(drill,
                         Settings.data["TestSettings"]["dataDirectory"])

    # Create Table Spark -------------------------------------------------
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("timestampTest").getOrCreate()
    cs.init_spark_schema(spark,
                         Settings.data["TestSettings"]["dataDirectory"])

# Create Context For BlazingSQL
bc, dask_client = init_context()

nRals = Settings.data["RunSettings"]["nRals"]
def pydrill_instance():
    drill = PyDrill()
    return drill
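# A minimal sketch of how such a helper is typically wired up as a pytest
# fixture (the decorator and the test body are assumptions, not part of the
# original snippet):
import pytest
from pydrill.client import PyDrill


@pytest.fixture
def pydrill_instance():
    return PyDrill()  # defaults to localhost:8047


def test_drillbit_reachable(pydrill_instance):
    # is_active() pings the Drill REST API and returns a boolean
    assert pydrill_instance.is_active()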
def main():
    print('**init end2end**')
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data['TestSettings']['dataDirectory']
    nRals = Settings.data['RunSettings']['nRals']

    drill = "drill"
    spark = "spark"
    compareResults = True
    if 'compare_results' in Settings.data['RunSettings']:
        compareResults = Settings.data['RunSettings']['compare_results']

    if ((Settings.execution_mode == ExecutionMode.FULL and
         compareResults == "true") or
            Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill -------------------------------------------------
        from pydrill.client import PyDrill
        drill = PyDrill(host='localhost', port=8047)
        createSchema.init_drill_schema(
            drill, Settings.data['TestSettings']['dataDirectory'],
            bool_test=True)

        # Create Table Spark -------------------------------------------------
        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data['TestSettings']['dataDirectory'])

    # Create Context For BlazingSQL
    bc, dask_client = init_context()

    targetTestGroups = Settings.data['RunSettings']['targetTestGroups']
    # if targetTestGroups was empty the user wants to run all the tests
    runAllTests = len(targetTestGroups) == 0

    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)
    if runAllTests or ("coalesceTest" in targetTestGroups):
        # we are not supporting coalesce yet
        coalesceTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file,
                                        bc, nRals)
    # countDistincTest.main(dask_client, drill, dir_data_file, bc)
    # we are not supporting count distinct yet
    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                     bc, nRals)
    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file,
                                       bc, nRals)
    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("leftOuterJoinsTest" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    # loadDataTest.main(dask_client, bc)  # check this
    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file,
                               bc, nRals)
    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)
    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, dir_data_file, bc, nRals)
    # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc)
    # HDFS is not working yet
    # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc)
    # HDFS is not working yet
    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("simpleDistributionTest" in targetTestGroups):
        simpleDistributionTest.main(dask_client, drill, spark, dir_data_file,
                                    bc, nRals)
    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file,
                             bc, nRals)
    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)
    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if Settings.execution_mode != ExecutionMode.GPUCI:
        if runAllTests or ("fileSystemS3Test" in targetTestGroups):
            fileSystemS3Test.main(dask_client, drill, dir_data_file, bc, nRals)
        if runAllTests or ("fileSystemGSTest" in targetTestGroups):
            fileSystemGSTest.main(dask_client, drill, dir_data_file, bc, nRals)
    # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:
        result, error_msgs = runTest.save_log()
        max_delta = 0
        for i in range(0, len(Settings.memory_list)):
            if Settings.memory_list[i].delta > max_delta:
                max_delta = Settings.memory_list[i].delta
        print("MAX DELTA: " + str(max_delta))
        print('*' * 79)
        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" +
                  " Start Mem: " + str(Settings.memory_list[i].start_mem) +
                  " End Mem: " + str(Settings.memory_list[i].end_mem) +
                  " Diff: " + str(Settings.memory_list[i].delta))
        return result, error_msgs

    return True, []
def test_authentication_failure():
    with pytest.raises(TransportError):
        PyDrill(auth='user:password')
import os

from pydrill.client import PyDrill

# ====== Connection ======
# Connect to Drill by providing a drillbit IP and the Drill REST API port
# (31000 by default)
conn = PyDrill(host=os.environ['IP_DRILLBIT'],
               port=os.environ['DRILL_API_PORT'])

# ====== Reading files ======
# The file employee.json is installed with Drill as a sample
query = conn.query('SELECT * FROM cp.`employee.json` LIMIT 20', timeout=60)

# Create a pandas DataFrame with the result of the query
df = query.to_dataframe()
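# A small, optional extension of the snippet above: verify the drillbit is
# reachable before querying and fail fast with a clear message instead of a
# transport error (the RuntimeError is an assumption, not pydrill API).
if not conn.is_active():
    raise RuntimeError('Drill server is not reachable; is a drillbit running?')

# Result rows can also be iterated directly; each row behaves like a dict
# keyed by column name.
for row in conn.query('SELECT * FROM cp.`employee.json` LIMIT 20', timeout=60):
    print(row)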
#!/usr/bin/env python
from pydrill.client import PyDrill

drill = PyDrill(host='10.32.48.136', port=48047, auth="jlim:2019Jfm!",
                use_ssl=False, verify_certs=False)

if not drill.is_active():
    # ImproperlyConfigured is assumed to be imported elsewhere
    # (e.g. from django.core.exceptions)
    raise ImproperlyConfigured('Please run Drill first')

tenants = drill.query('''
    SELECT *
    FROM dfs.`/tsys/qa/internal/data/maprdb/tenants`
    LIMIT 5
''')

for result in tenants:
    print(result)
def __init__(self, *args, **kwargs):
    self._kwargs = kwargs
    self._args = args
    self._conn = PyDrill(**kwargs)
# IMPORTS
###############################################################################
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Event
from flask import Flask
import os
from pydrill.client import PyDrill
from tb.modules.trending_brand_query_engine.query_engine import (
    tweets_per_minute, sentiment_query_engine, last_message_form_kafka)
from kafka import KafkaConsumer
###############################################################################

consumer = KafkaConsumer('beer', group_id='ui2')
drill1 = PyDrill(host='localhost', port=8047)
drill2 = PyDrill(host='localhost', port=8047)

server = Flask('my app')
server.secret_key = os.environ.get('secret_key', 'secret')

app = dash.Dash('streaming-wind-app', server=server,
                url_base_pathname='/BEER/', csrf_protect=False)

app.layout = html.Div([
    html.Div([
        html.H2("Trending-Brand: Beer consumption"),
    ], className='banner'),
    html.Div([
        html.Div([
def main(dask_client, bc):
    # Create Table Drill ------------------------------------------------
    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)
    dir_data_lc = Settings.data["TestSettings"]["dataDirectory"]

    for x in range(5):
        # [numberOfFiles, type_nation, type_region, type_supplier,
        #  type_customer, type_lineitem, type_orders]
        run = []
        if x == 0:
            run = [1, "psv", "psv", "psv", "psv", "psv", "psv"]
        elif x == 1:
            run = [2, "parquet", "parquet", "parquet", "parquet", "parquet",
                   "parquet"]
        elif x == 2:
            run = [6, "parquet", "psv", "parquet", "psv", "parquet", "psv"]
        elif x == 3:
            run = [10, "psv", "parquet", "psv", "parquet", "psv", "parquet"]
        elif x == 4:
            run = [12, "psv", "psv", "parquet", "parquet", "psv", "parquet"]

        print("============================================================")
        print("Running " + str(x + 1) + ":")
        print("Number of files: " + str(run[0]))
        print("Type of files for Nation: " + run[1])
        print("Type of files for Region: " + run[2])
        print("Type of files for Supplier: " + run[3])
        print("Type of files for Customer: " + run[4])
        print("Type of files for Lineitem: " + run[5])
        print("Type of files for Orders: " + run[6])
        print("============================================================")

        print("1")
        num_files = run[0]
        print("2")
        cs.init_drill_schema(drill, dir_data_lc, n_files=num_files)
        print("3")

        # Read Data TPCH -----------------------------------------------------
        nation_files = cs.get_filenames_table("nation", dir_data_lc,
                                              num_files, run[1])
        bc.create_table(
            "nation",
            nation_files,
            delimiter="|",
            dtype=cs.get_dtypes("nation"),
            names=cs.get_column_names("nation"),
        )
        region_files = cs.get_filenames_table("region", dir_data_lc,
                                              num_files, run[2])
        bc.create_table(
            "region",
            region_files,
            delimiter="|",
            dtype=cs.get_dtypes("region"),
            names=cs.get_column_names("region"),
        )
        supplier_files = cs.get_filenames_table("supplier", dir_data_lc,
                                                num_files, run[3])
        bc.create_table(
            "supplier",
            supplier_files,
            delimiter="|",
            dtype=cs.get_dtypes("supplier"),
            names=cs.get_column_names("supplier"),
        )
        customer_files = cs.get_filenames_table("customer", dir_data_lc,
                                                num_files, run[4])
        bc.create_table(
            "customer",
            customer_files,
            delimiter="|",
            dtype=cs.get_dtypes("customer"),
            names=cs.get_column_names("customer"),
        )
        lineitem_files = cs.get_filenames_table("lineitem", dir_data_lc,
                                                num_files, run[5])
        bc.create_table(
            "lineitem",
            lineitem_files,
            delimiter="|",
            dtype=cs.get_dtypes("lineitem"),
            names=cs.get_column_names("lineitem"),
        )
        orders_files = cs.get_filenames_table("orders", dir_data_lc,
                                              num_files, run[6])
        bc.create_table(
            "orders",
            orders_files,
            delimiter="|",
            dtype=cs.get_dtypes("orders"),
            names=cs.get_column_names("orders"),
        )

        # Run Query ------------------------------------------------------
        # Parameter to indicate whether the result sets must be ordered
        # before comparing them
        worder = 1
        use_percentage = False
        acceptable_difference = 0.01
        queryType = "Load Data Test"
        print("==============================")
        print(queryType)
        print("==============================")

        queryId = "TEST_01"
        query = """select count(c_custkey) as c1, count(c_acctbal) as c2
                from customer"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_02"
        query = "select count(n_nationkey), count(n_regionkey) from nation"
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_03"
        query = "select count(s_suppkey), count(s_nationkey) from supplier"
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_04"
        query = """select count(c_custkey), sum(c_acctbal),
                sum(c_acctbal)/count(c_acctbal), min(c_custkey),
                max(c_nationkey),
                (max(c_nationkey) + min(c_nationkey))/2 c_nationkey
                from customer where c_custkey < 100 group by c_nationkey"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_05"
        query = """select c.c_custkey, c.c_nationkey, n.n_regionkey
                from customer as c
                inner join nation as n on c.c_nationkey = n.n_nationkey
                where n.n_regionkey = 1 and c.c_custkey < 50"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_06"
        query = """select c_custkey, c_nationkey, c_acctbal
                from customer order by c_nationkey, c_acctbal"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, 0, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_07"
        query = """select c_custkey + c_nationkey, c_acctbal
                from customer order by 1, 2"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, 0, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_08"
        query = """select n1.n_nationkey as supp_nation,
                n2.n_nationkey as cust_nation,
                l.l_extendedprice * l.l_discount
                from supplier as s
                inner join lineitem as l on s.s_suppkey = l.l_suppkey
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                inner join nation as n1 on s.s_nationkey = n1.n_nationkey
                inner join nation as n2 on c.c_nationkey = n2.n_nationkey
                where n1.n_nationkey = 1 and n2.n_nationkey = 2
                and o.o_orderkey < 10000"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_09"
        query = """select c_custkey, c_nationkey as nkey from customer
                where c_custkey < 0 and c_nationkey >= 30"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_10"
        query = """select sin(c_acctbal), cos(c_acctbal), asin(c_acctbal),
                acos(c_acctbal), ln(c_acctbal), tan(c_acctbal),
                atan(c_acctbal), floor(c_acctbal), c_acctbal
                from customer"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_11"
        query = """select n1.n_nationkey as n1key, n2.n_nationkey as n2key,
                n1.n_nationkey + n2.n_nationkey
                from nation as n1
                full outer join nation as n2
                on n1.n_nationkey = n2.n_nationkey + 6
                where n1.n_nationkey < 10 and n1.n_nationkey > 5"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_12"
        query = """select count(n1.n_nationkey) as n1key,
                count(n2.n_nationkey) as n2key, count(*) as cstar
                from nation as n1
                full outer join nation as n2
                on n1.n_nationkey = n2.n_nationkey + 6"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_13"
        query = """select o_orderkey, o_custkey from orders
                where o_orderkey < 10 and o_orderkey >= 1"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_14"
        query = """select 100168549 - sum(o_orderkey)/count(o_orderkey),
                56410984 / sum(o_totalprice),
                (123 - 945/max(o_orderkey)) /
                (sum(81619/o_orderkey) / count(81619/o_orderkey))
                from orders"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_15"
        query = """select o_orderkey, sum(o_totalprice)/count(o_orderstatus)
                from orders where o_custkey < 100
                group by o_orderstatus, o_orderkey"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_16"
        query = """select o_orderkey, o_orderstatus from orders
                where o_custkey < 10 and o_orderstatus <> 'O'
                order by o_orderkey, o_orderstatus limit 50"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_17"
        query = """select count(o_orderstatus) from orders
                where o_orderstatus <> 'O'"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_18"
        query = """select count(o_orderkey), sum(o_orderkey), o_clerk
                from orders where o_custkey < 1000
                group by o_clerk, o_orderstatus"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_19"
        query = """select sum(o_orderkey)/count(o_orderkey) from orders
                group by o_orderstatus"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_20"
        query = """select count(o_shippriority), sum(o_totalprice)
                from orders group by o_shippriority"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_21"
        query = """with regionTemp as (
                select r_regionkey, r_name from region where r_regionkey > 2
                ), nationTemp as (
                select n_nationkey, n_regionkey as fkey, n_name from nation
                where n_nationkey > 3 order by n_nationkey
                )
                select regionTemp.r_name, nationTemp.n_name
                from regionTemp inner join nationTemp
                on regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_22"
        query = """select o.o_totalprice, l.l_partkey
                from orders as o
                left outer join lineitem as l
                on o.o_custkey = l.l_linenumber
                and l.l_suppkey = o.o_orderkey
                where l.l_linenumber < 1000"""
        runTest.run_query_performance(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_23"
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey,
                l.l_returnflag
                from lineitem as l
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                where c.c_custkey < 1000"""
        runTest.run_query_performance(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

        queryId = "TEST_24"
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey,
                l.l_linestatus
                from orders as o
                full outer join lineitem as l
                on l.l_orderkey = o.o_orderkey
                where o.o_orderkey < 1000"""
        runTest.run_query_performance(
            bc, drill, query, queryId, queryType, worder, "",
            acceptable_difference, use_percentage,  # fileSchemaType,
        )

    runTest.save_log()
def setUp(self):
    self.drill = PyDrill(host='localhost', port=8047)
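# A minimal sketch of the unittest.TestCase this setUp presumably belongs to
# (class name and test method are assumptions):
import unittest

from pydrill.client import PyDrill


class DrillTestCase(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    def test_server_is_active(self):
        # is_active() pings the Drill REST endpoint
        self.assertTrue(self.drill.is_active())


if __name__ == '__main__':
    unittest.main()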
def process_result(self, sql, queue, projvartocols, coltotemplates,
                   res_dict=None):
    c = 0
    try:
        if not self.drill.is_active():
            try:
                self.drill = PyDrill(host=self.host, port=self.port)
            except Exception as ex:
                print("Exception while connecting to Drill for query "
                      "processing", ex)
                return 0
        try:
            results = self.drill.query(sql, timeout=1000)
        except Exception as ex:
            print("Exception while running query to Drill for query "
                  "processing", ex)
            return 0

        for row in results:
            c += 1
            # if res_dict is not None:
            #     rowtxt = ",".join(list(row.values()))
            #     if rowtxt in res_dict:
            #         continue
            #     else:
            #         res_dict.append(rowtxt)
            res = {}
            skip = False
            for r in row:
                if row[r] == 'null':
                    skip = True
                    break
                if '_' in r and r[:r.find("_")] in projvartocols:
                    s = r[:r.find("_")]
                    if s in res:
                        val = res[s]
                        if 'http://' in row[r]:
                            res[s] = row[r]
                        else:
                            res[s] = val.replace(
                                '{' + r[r.find("_") + 1:] + '}',
                                row[r].replace(" ", '_'))
                    else:
                        if 'http://' in r:
                            res[s] = r
                        else:
                            res[s] = coltotemplates[s].replace(
                                '{' + r[r.find("_") + 1:] + '}',
                                row[r].replace(" ", '_'))
                elif r in projvartocols and r in coltotemplates:
                    if 'http://' in row[r]:
                        res[r] = row[r]
                    else:
                        res[r] = coltotemplates[r].replace(
                            '{' + projvartocols[r] + '}',
                            row[r].replace(" ", '_'))
                else:
                    res[r] = row[r]
            if not skip:
                queue.put(res)
        return c
    except Exception as e:
        print("Exception while processing drill results", e, sql)
        logger.error(sql)
        logger.error("Exception while processing results:" + str(e))
        import traceback
        traceback.print_stack()
    return c
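# A standalone mini-example (all names made up) of the template substitution
# process_result performs: a raw column value is spliced into a URI template
# such as 'http://example.org/compound/{id}'.
coltotemplates = {'compound': 'http://example.org/compound/{id}'}
projvartocols = {'compound': 'id'}
row = {'compound': 'C00031'}

r = 'compound'
res = {}
if 'http://' in row[r]:
    res[r] = row[r]  # value is already a full URI
else:
    res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}',
                                       row[r].replace(' ', '_'))
print(res)  # {'compound': 'http://example.org/compound/C00031'}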
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util
from pymongo import MongoClient

# print("Content-type: application/json\n\n")
print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)
if not drill.is_active():
    # ImproperlyConfigured is assumed to be imported elsewhere
    # (e.g. from django.core.exceptions)
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"  # NOTE: unused; the query below hardcodes Pittsburgh

yelp_reviews = drill.query('''
    select sum(case when t.stars=1.0 then 1 else 0 end) as `1`,
           sum(case when t.stars=2.0 then 1 else 0 end) as `2`,
           sum(case when t.stars=2.5 then 1 else 0 end) as `3`,
           sum(case when t.stars=3.0 then 1 else 0 end) as `4`,
           sum(case when t.stars=3.5 then 1 else 0 end) as `5`,
           sum(case when t.stars=4.0 then 1 else 0 end) as `6`,
           sum(case when t.stars=4.5 then 1 else 0 end) as `7`,
           sum(case when t.stars=5.0 then 1 else 0 end) as `8`
    from `mongo.274_BI`.`yelp_dataset` t
    where t.city='Pittsburgh'
      and true=repeated_contains(categories, 'Restaurants')
''')

print(dumps(yelp_reviews))
def main(dask_client, bc):
    # Create Table Drill ------------------------------------------------
    drill = PyDrill(host='localhost', port=8047)
    dir_data_lc = Settings.data['TestSettings']['dataDirectory']

    for x in range(5):
        # [numberOfFiles, type_nation, type_region, type_supplier,
        #  type_customer, type_lineitem, type_orders]
        run = []
        if x == 0:
            run = [1, 'psv', 'psv', 'psv', 'psv', 'psv', 'psv']
        elif x == 1:
            run = [2, 'parquet', 'parquet', 'parquet', 'parquet', 'parquet',
                   'parquet']
        elif x == 2:
            run = [6, 'parquet', 'psv', 'parquet', 'psv', 'parquet', 'psv']
        elif x == 3:
            run = [10, 'psv', 'parquet', 'psv', 'parquet', 'psv', 'parquet']
        elif x == 4:
            run = [12, 'psv', 'psv', 'parquet', 'parquet', 'psv', 'parquet']

        print("=" * 71)
        print("Running " + str(x + 1) + ":")
        print("Number of files: " + str(run[0]))
        print("Type of files for Nation: " + run[1])
        print("Type of files for Region: " + run[2])
        print("Type of files for Supplier: " + run[3])
        print("Type of files for Customer: " + run[4])
        print("Type of files for Lineitem: " + run[5])
        print("Type of files for Orders: " + run[6])
        print("=" * 71)

        print("1")
        num_files = run[0]
        print("2")
        cs.init_drill_schema(drill, dir_data_lc, n_files=num_files)
        print("3")

        # Read Data TPCH -----------------------------------------------------
        nation_files = cs.get_filenames_table('nation', dir_data_lc,
                                              num_files, run[1])
        bc.create_table('nation', nation_files, delimiter='|',
                        dtype=cs.get_dtypes('nation'),
                        names=cs.get_column_names('nation'))
        region_files = cs.get_filenames_table('region', dir_data_lc,
                                              num_files, run[2])
        bc.create_table('region', region_files, delimiter='|',
                        dtype=cs.get_dtypes('region'),
                        names=cs.get_column_names('region'))
        supplier_files = cs.get_filenames_table('supplier', dir_data_lc,
                                                num_files, run[3])
        bc.create_table('supplier', supplier_files, delimiter='|',
                        dtype=cs.get_dtypes('supplier'),
                        names=cs.get_column_names('supplier'))
        customer_files = cs.get_filenames_table('customer', dir_data_lc,
                                                num_files, run[4])
        bc.create_table('customer', customer_files, delimiter='|',
                        dtype=cs.get_dtypes('customer'),
                        names=cs.get_column_names('customer'))
        lineitem_files = cs.get_filenames_table('lineitem', dir_data_lc,
                                                num_files, run[5])
        bc.create_table('lineitem', lineitem_files, delimiter='|',
                        dtype=cs.get_dtypes('lineitem'),
                        names=cs.get_column_names('lineitem'))
        orders_files = cs.get_filenames_table('orders', dir_data_lc,
                                              num_files, run[6])
        bc.create_table('orders', orders_files, delimiter='|',
                        dtype=cs.get_dtypes('orders'),
                        names=cs.get_column_names('orders'))

        # Run Query ------------------------------------------------------
        # Parameter to indicate whether the result sets must be ordered
        # before comparing them
        worder = 1
        use_percentage = False
        acceptable_difference = 0.01
        queryType = 'Load Data Test'
        print('==============================')
        print(queryType)
        print('==============================')

        # NOTE: fileSchemaType is not defined in this snippet; it is assumed
        # to be set at module scope in the original source.
        queryId = 'TEST_01'
        query = "select count(c_custkey) as c1, count(c_acctbal) as c2 from customer"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_02'
        query = "select count(n_nationkey), count(n_regionkey) from nation"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_03'
        query = "select count(s_suppkey), count(s_nationkey) from supplier"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_04'
        query = "select count(c_custkey), sum(c_acctbal), sum(c_acctbal)/count(c_acctbal), min(c_custkey), max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2 c_nationkey from customer where c_custkey < 100 group by c_nationkey"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, True)
        # TODO: Change sum/count for avg KC

        queryId = 'TEST_05'
        query = "select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_06'
        query = "select c_custkey, c_nationkey, c_acctbal from customer order by c_nationkey, c_acctbal"
        runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_07'
        query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1, 2"
        runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_08'
        query = "select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_09'
        query = "select c_custkey, c_nationkey as nkey from customer where c_custkey < 0 and c_nationkey >= 30"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_10'
        query = "select sin(c_acctbal), cos(c_acctbal), asin(c_acctbal), acos(c_acctbal), ln(c_acctbal), tan(c_acctbal), atan(c_acctbal), floor(c_acctbal), c_acctbal from customer"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_11'
        query = "select n1.n_nationkey as n1key, n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 where n1.n_nationkey < 10 and n1.n_nationkey > 5"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_12'
        query = "select count(n1.n_nationkey) as n1key, count(n2.n_nationkey) as n2key, count(*) as cstar from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_13'
        query = "select o_orderkey, o_custkey from orders where o_orderkey < 10 and o_orderkey >= 1"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_14'
        query = "select 100168549 - sum(o_orderkey)/count(o_orderkey), 56410984/sum(o_totalprice), (123 - 945/max(o_orderkey))/(sum(81619/o_orderkey)/count(81619/o_orderkey)) from orders"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, True)
        # TODO: Change sum/count for avg KC

        queryId = 'TEST_15'
        query = "select o_orderkey, sum(o_totalprice)/count(o_orderstatus) from orders where o_custkey < 100 group by o_orderstatus, o_orderkey"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_16'
        query = "select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus <> 'O' order by o_orderkey, o_orderstatus limit 50"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_17'
        query = "select count(o_orderstatus) from orders where o_orderstatus <> 'O'"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_18'
        query = "select count(o_orderkey), sum(o_orderkey), o_clerk from orders where o_custkey < 1000 group by o_clerk, o_orderstatus"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_19'
        query = "select sum(o_orderkey)/count(o_orderkey) from orders group by o_orderstatus"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, True)
        # TODO: Change sum/count for avg KC

        queryId = 'TEST_20'
        query = "select count(o_shippriority), sum(o_totalprice) from orders group by o_shippriority"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_21'
        query = """with regionTemp as (
                select r_regionkey, r_name from region where r_regionkey > 2
                ), nationTemp as (
                select n_nationkey, n_regionkey as fkey, n_name from nation
                where n_nationkey > 3 order by n_nationkey
                )
                select regionTemp.r_name, nationTemp.n_name
                from regionTemp inner join nationTemp
                on regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_22'
        query = """select o.o_totalprice, l.l_partkey
                from orders as o
                left outer join lineitem as l
                on o.o_custkey = l.l_linenumber
                and l.l_suppkey = o.o_orderkey
                where l.l_linenumber < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        queryId = 'TEST_23'
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey,
                l.l_returnflag
                from lineitem as l
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                where c.c_custkey < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        queryId = 'TEST_24'
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey,
                l.l_linestatus
                from orders as o
                full outer join lineitem as l
                on l.l_orderkey = o.o_orderkey
                where o.o_orderkey < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

    runTest.save_log()