Example #1
File: __main__.py  Project: xav-b/datacli
def cli(dbname, host, port, timeout, interactive, version):
    """Cli entry point."""
    if version or dbname == 'version':
        print('datacli version: {}'.format(__version__))
        sys.exit(0)

    configure_logger()

    conn = PyDrill(host=host, port=port)
    if not conn.is_active():
        log.error('unable to reach Drill server')
        return 1

    cli = DataCli(conn, dbname, DataPrompt(), timeout=timeout)

    log.info('connected to Drillbit')
    while True:
        try:
            should_exit = cli.repl(interactive)
            if should_exit:
                break
        except KeyboardInterrupt:
            break  # Control-C pressed
        except EOFError:
            break  # Control-D pressed

    log.info('shutting down...')
    return 0
Example #2
File: db.py  Project: edblancas/ploomber
    def connection(self):
        from pydrill.client import PyDrill

        if self._connection is None:
            self._connection = PyDrill(**self.params)

        return self._connection
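
The property above creates the PyDrill client lazily on first access and then reuses it. A minimal sketch of how such a wrapper might be used, assuming a hypothetical DrillClient class with the same property (the class name and parameters are illustrative, not taken from the ploomber source):

from pydrill.client import PyDrill


class DrillClient:
    """Hypothetical wrapper that defers creating the PyDrill connection."""

    def __init__(self, **params):
        self.params = params  # e.g. host='localhost', port=8047
        self._connection = None

    @property
    def connection(self):
        # Built on first access, then cached for later calls
        if self._connection is None:
            self._connection = PyDrill(**self.params)
        return self._connection


client = DrillClient(host='localhost', port=8047)
# First access opens the connection; requires a running Drillbit
result = client.connection.query('SELECT * FROM cp.`employee.json` LIMIT 1')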
Example #3
def test_authentication_success(pydrill_url):

    responses.add(
        **{
            'method': responses.POST,
            'url': "{0}/{1}".format(pydrill_url, 'j_security_check'),
        })

    PyDrill(auth='user:password')
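
The test above registers a mocked response for Drill's j_security_check login endpoint, so constructing an authenticated client never hits a real server. A sketch of the surrounding pieces such a test typically assumes; the pydrill_url fixture value and the responses.activate decorator are assumptions, not shown in the original:

import pytest
import responses

from pydrill.client import PyDrill


@pytest.fixture
def pydrill_url():
    # Assumed base URL of the Drill REST API (PyDrill's defaults)
    return 'http://localhost:8047'


@responses.activate
def test_authentication_success(pydrill_url):
    # Mock the login endpoint so no Drillbit is required
    responses.add(method=responses.POST,
                  url='{0}/{1}'.format(pydrill_url, 'j_security_check'))
    # Creating the client with credentials issues the (mocked) login POST
    PyDrill(auth='user:password')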
Example #4
def main():
    print("**init end2end**")

    Execution.getArgs()

    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]

    # Create Table Drill -----------------------------------------
    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    # All tests only pass with 100 MB
    csvFromLocalTest.main(drill, dir_data_file)

    csvFromS3Test.main(
        drill, dir_data_file
    )  # AttributeError: 'NoneType' object has no attribute '_cols'

    # vector::_M_range_check: __n
    # (which is 18446744073709551615) >= this->size() (which is 2)
    csvFromHdfsTest.main(
        drill, dir_data_file
    )

    parquetFromLocalTest.main(
        drill, dir_data_file
    )  # All tests only pass with 100 MB

    # All tests pass with 100 MB; with multiple files per table they do not,
    # because not all of the files are loaded correctly.
    parquetFromS3Test.main(
        drill, dir_data_file
    )

    parquetFromHdfsTest.main(
        drill, dir_data_file
    )  # Hangs while reading the data

    runTest.save_log()

    for i in range(0, len(Settings.memory_list)):
        print(
            Settings.memory_list[i].name
            + ":"
            + "   Start Mem: "
            + str(Settings.memory_list[i].start_mem)
            + "   End Mem: "
            + str(Settings.memory_list[i].end_mem)
            + "   Diff: "
            + str(Settings.memory_list[i].delta)
        )
Example #5
def init_drill():
    # Start Drill schema-----------------------------------------
    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(
        drill, Settings.data["TestSettings"]["dataDirectory"], bool_test=True)
    createSchema.init_drill_schema(
        drill,
        Settings.data["TestSettings"]["dataDirectory"],
        smiles_test=True,
        fileSchemaType=DataType.PARQUET)

    return drill
Example #6
def main():
    print('**init performance test**')
    Execution.getArgs()

    dir_data_file = Settings.data['TestSettings']['dataDirectory']

    # Create Table Drill ------------------------------------------------------------------------------------------------------
    drill = PyDrill(host='localhost', port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    jobId = 1

    if Settings.data['MysqlConnection']['connectEnabled']:
        from DataBase import mysqlDatabaseManager as msqldb
        jobId = msqldb.getJobId()

    for x in range(0, 10):
        performanceTest.main(drill, dir_data_file)
        runTest.save_log(job_id=jobId)
Example #7
def main():
    print("**init end2end**")
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]
    nRals = Settings.data["RunSettings"]["nRals"]

    drill = "drill"
    spark = "spark"
    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if (Settings.execution_mode == ExecutionMode.FULL and compareResults
            == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:

        # Create Table Drill -----------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            bool_test=True)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True,
            fileSchemaType=DataType.PARQUET)

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data["TestSettings"]["dataDirectory"])
        createSchema.init_spark_schema(
            spark,
            Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True,
            fileSchemaType=DataType.PARQUET)

    targetTestGroups = Settings.data["RunSettings"]["targetTestGroups"]

    # only innerJoinsTest will be with progress bar
    useProgressBar = False
    if "innerJoinsTest" in targetTestGroups:
        useProgressBar = True

    print("Using progress bar: ", useProgressBar)

    # Create Context For BlazingSQL
    bc, dask_client = init_context(useProgressBar=useProgressBar)

    runAllTests = (
        len(targetTestGroups) == 0
    )  # if targetTestGroups was empty the user wants to run all the tests

    if runAllTests or ("hiveFileTest" in targetTestGroups):
        hiveFileTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)

    if runAllTests or ("coalesceTest" in targetTestGroups):
        coalesceTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file, bc,
                                        nRals)

    if runAllTests or ("countDistinctTest" in targetTestGroups):
        countDistinctTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file, bc,
                                     nRals)

    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("toTimestampTest" in targetTestGroups):
        toTimestampTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dayOfWeekTest" in targetTestGroups):
        dayOfWeekTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file, bc,
                                       nRals)

    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("crossJoinsTest" in targetTestGroups):
        crossJoinsTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("leftOuterJoinsTest" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, spark, dir_data_file, bc,
                              nRals)

    # loadDataTest.main(dask_client, bc) #check this

    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, spark, dir_data_file, bc,
                                 nRals)

    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file, bc,
                               nRals)

    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    # HDFS is not working yet
    # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc)

    # HDFS is not working yet
    # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc)

    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("stringCaseTest" in targetTestGroups):
        stringCaseTest.main(dask_client, drill, spark, dir_data_file, bc,
                            nRals)

    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file, bc,
                             nRals)

    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, spark, dir_data_file, bc,
                                 nRals)

    if runAllTests or ("messageValidationTest" in targetTestGroups):
        messageValidationTest.main(dask_client, drill, dir_data_file, bc,
                                   nRals)

    testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]
    if testsWithNulls != "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("fileSystemS3Test" in targetTestGroups):
                fileSystemS3Test.main(dask_client, drill, dir_data_file, bc,
                                      nRals)

            if runAllTests or ("fileSystemGSTest" in targetTestGroups):
                fileSystemGSTest.main(dask_client, drill, dir_data_file, bc,
                                      nRals)

    if runAllTests or ("loggingTest" in targetTestGroups):
        loggingTest.main(dask_client, dir_data_file, bc, nRals)

    # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    #TODO re enable this test once we have the new version of dask
    # https://github.com/dask/distributed/issues/4645
    # https://github.com/rapidsai/cudf/issues/7773
    #if runAllTests or ("smilesTest" in targetTestGroups):
    #    smilesTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("jsonTest" in targetTestGroups):
            jsonTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("windowFunctionTest" in targetTestGroups):
        windowFunctionTest.main(dask_client, drill, spark, dir_data_file, bc,
                                nRals)

    if runAllTests or ("windowNoPartitionTest" in targetTestGroups):
        windowNoPartitionTest.main(dask_client, drill, spark, dir_data_file,
                                   bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("concurrentTest" in targetTestGroups):
            concurrentTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if testsWithNulls == "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("tablesFromSQL" in targetTestGroups):
                tablesFromSQL.main(dask_client, drill, dir_data_file, bc,
                                   nRals)

    # WARNING!!! This Test must be the last one to test -------------------------------------------------------------------------------------------------------------------------------------------
    if runAllTests or ("configOptionsTest" in targetTestGroups):
        configOptionsTest.main(dask_client, drill, spark, dir_data_file, bc,
                               nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:

        result, error_msgs = runTest.save_log(
            Settings.execution_mode == ExecutionMode.GPUCI)

        max = 0
        for i in range(0, len(Settings.memory_list)):
            if (Settings.memory_list[i].delta) > max:
                max = Settings.memory_list[i].delta

        print("MAX DELTA: " + str(max))
        print("""***********************************************************
              ********************""")

        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" + "   Start Mem: " +
                  str(Settings.memory_list[i].start_mem) + "   End Mem: " +
                  str(Settings.memory_list[i].end_mem) + "   Diff: " +
                  str(Settings.memory_list[i].delta))

        return result, error_msgs

    return True, []
Example #8
def get_column_types(query):

    drill = PyDrill(host='localhost', port=8049)
    data = drill.query(query)
    columns = data.columns
    types = {}

    formattedQuery = sqlparse.format(query,
                                     reindent=True,
                                     keyword_case='upper')
    formattedQuery = formattedQuery.split('\n')
    print(sqlparse.format(query, reindent=True, keyword_case='upper'))
    inSelect = False
    inSubquery = False
    inFromClause = False
    fields = []

    fieldRegex = r'\s{7}\S'
    fieldSubquery = r'\s'
    subqueryFieldPattern = r'\s{2,3}\S'
    subqueryField = ""

    fromClause = ""
    functionPattern = r'\s+(\S+)\('

    fieldCount = 0
    for line in formattedQuery:
        functionMatchObject = re.match(functionPattern, line)

        if line.startswith('SELECT'):
            inSelect = True
            line = line.replace('SELECT', '')

            line = line.strip()
            #remove trailing comma
            if len(line) > 0:
                if line[-1:] == ",":
                    line = line[:-1]
            fields.append(line)

        # If the line is a function, assign the correct return type
        elif inSelect and inFromClause == False and functionMatchObject:
            print("FieldCount: " + str(fieldCount) + " " + line)
            functionCandidate = functionMatchObject.group(1)
            functionCandidate = functionCandidate.upper()
            if functionCandidate in _BIG_INT_FUNCTIONS:
                types[columns[fieldCount]] = "bigint"

            elif functionCandidate in _INT_FUNCTIONS:
                types[columns[fieldCount]] = "integer"

            elif functionCandidate in _FLOAT_FUNCTIONS:
                types[columns[fieldCount]] = "float"

            else:
                types[columns[fieldCount]] = "varchar"

            fieldCount += 1
            continue

        # Case for a regular field
        elif inSelect == True and re.match(fieldRegex, line):
            line = line.strip()
            # remove trailing comma from field name
            if len(line) > 0:
                if line[-1:] == ",":
                    line = line[:-1]
            fields.append(line)

        elif inSelect == True and line.startswith('FROM'):
            inSelect = False
            inFromClause = True
            if inSubquery:
                fields.append(subqueryField)
                inSubquery = False
            else:
                fromClause = fromClause + " " + line.strip()

        elif inFromClause == True and (line.startswith('WHERE')
                                       or line.startswith('GROUP')
                                       or line.startswith('ORDER')
                                       or line.startswith('HAVING')):
            inFromClause = False
            inSelect = False

        elif re.match(subqueryFieldPattern,
                      line) and inSubquery == False and inFromClause == False:
            inSubquery = True
            subqueryField = line.strip()

        elif inSubquery == True:
            subqueryField = subqueryField + " " + line.strip()
            if line.endswith(','):
                inSubquery = False
                fields.append(subqueryField)
                subqueryField = ""

        elif inSubquery == True and line == False:
            inSubquery = False
            fields.append(subqueryField)
            subqueryField = ""

        elif inFromClause == True:
            fromClause = fromClause + " " + line.strip()

        fieldCount += 1

    typeQuery = "SELECT"
    fieldCount = 0
    aliasPattern = r'AS\s`?[a-zA-Z_][a-zA-Z0-9-_$` ]*$'
    for field in fields:
        if re.search(aliasPattern, field):
            field = re.sub(aliasPattern, '', field)

        if fieldCount > 0:
            typeQuery += ","
        typeQuery = typeQuery + " " + field + " AS " + columns[
            fieldCount] + ", typeof( " + field + ") AS " + columns[
                fieldCount] + "_type"
        fieldCount += 1

    typeQuery += fromClause
    typeQuery += " LIMIT 1"
    typeQuery = sqlparse.format(typeQuery, reindent=True, keyword_case='upper')

    print(typeQuery)
    fieldQueryResult = drill.query(typeQuery).to_dataframe()
    tempTypes = fieldQueryResult.T.to_dict()[0]

    for column in columns:
        if column not in types.keys():
            types[column] = tempTypes[column + "_type"]

    print(types)
    return types
Example #9
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):
        """
        Entry point for query execution on csv files
        :param querystr: string query
        :return:
        """
        from time import time
        # start = time()
        # print("Start:", start)
        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []
        # querytxt = query
        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)]

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset

        sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters)
        # print(sqlquery)
        # totalres = 0
        if sqlquery is None or len(sqlquery) == 0:
            queue.put("EOF")
            return []
        try:
            start = time()
            try:
                self.drill = PyDrill(host=self.host, port=self.port)
            except Exception as ex:
                print("Exception while connecting to Drill", ex)
                queue.put("EOF")
                return
            if not self.drill.is_active():
                print('Exception: Please run Drill first')
                queue.put("EOF")
                return
            # print("Drill Initialization cost:", time() - start)
            logger.info("Drill Initialization cost:" + str(time() - start))
            start = time()
            if isinstance(sqlquery, list):
                sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
                if len(sqlquery) > 3:
                    sqlquery = " UNION ".join(sqlquery)
            if isinstance(sqlquery, list):
                sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
                # logger.info(" UNION ".join(sqlquery))
                processqueues = []
                processes = []
                res_dict = []
                for sql in sqlquery:
                    # processquery = Queue()
                    # self.run_union(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict)
                    # print(sql)
                    processquery = Queue()
                    processqueues.append(processquery)
                    p = Process(target=self.run_union, args=(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict,))
                    p.start()
                    processes.append(p)

                while len(processqueues) > 0:
                    toremove = []
                    try:
                        for q in processqueues:
                            if q.get(False) == 'EOF':
                                toremove.append(q)
                        for p in processes:
                            if p.is_alive():
                                p.terminate()
                    except:
                        pass
                    for q in toremove:
                        processqueues.remove(q)
                logger.info("Done running:")
                sw = " UNION ".join(sqlquery)
                logger.info(sw)
            else:
                card = 0
                # if limit == -1:
                limit = 1000
                if offset == -1:
                    offset = 0
                logger.info(sqlquery)
                # print(sqlquery)
                while True:
                    query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset)
                    cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates)
                    card += cardinality
                    if cardinality < limit:
                        break

                    offset = offset + limit
            # print("Exec in Drill took:", time() - start)
            logger.info("Exec in Drill took:" + str(time() - start))
        except Exception as e:
            print("Exception ", e)
            pass
        # print('End:', time(), "Total results:", totalres)
        # print("Drill finished after: ", (time()-start))
        queue.put("EOF")
Example #10
from IPython.core.magic import register_line_magic

# expose drill client
from pydrill.client import PyDrill
drill = PyDrill(host="drill", port=8047)

# adjust pandas settings
import pandas
pandas.set_option('display.max_colwidth', -1)
pandas.set_option('display.max_rows', None)

import datetime

from pandas.io.json import json_normalize
import json


# drill query wrapper
def drill_query(query):
    res = drill.query(query)
    df = res.to_dataframe().dropna()
    df['@timestamp'] = pandas.to_datetime(df['@timestamp'], utc=True)
    return df


# time based query - returns results from last x minutes
def drill_tquery(query_s, minutes):
    df = drill_query(query_s)
    return df[df['@timestamp'] >= datetime.datetime.now().replace(
        tzinfo=datetime.timezone.utc) - datetime.timedelta(minutes=minutes)]
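
A short usage sketch for the two wrappers above; the table path and the 15-minute window are illustrative, and the queried data must expose an @timestamp column:

# Rows written to a hypothetical Drill table during the last 15 minutes
recent = drill_tquery("SELECT * FROM dfs.`/data/events` LIMIT 1000", minutes=15)
print(recent.head())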
Example #11
    nvmlInit()

    drill = "drill"  # None
    spark = "spark"

    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL
         and compareResults == "true")
            or Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill ------------------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        cs.init_drill_schema(drill,
                             Settings.data["TestSettings"]["dataDirectory"])

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("timestampTest").getOrCreate()
        cs.init_spark_schema(spark,
                             Settings.data["TestSettings"]["dataDirectory"])

    # Create Context For BlazingSQL

    bc, dask_client = init_context()

    nRals = Settings.data["RunSettings"]["nRals"]
Example #12
def pydrill_instance():
    drill = PyDrill()
    return drill
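
Called with no arguments, PyDrill falls back to its defaults (localhost and REST port 8047), so this only works against a local Drillbit. A minimal sketch of a test consuming it, assuming pydrill_instance is registered as a pytest fixture (the test body is illustrative):

def test_is_active(pydrill_instance):
    # Requires a Drillbit listening on localhost:8047
    assert pydrill_instance.is_active()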
Example #13
def main():
    print('**init end2end**')
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data['TestSettings']['dataDirectory']
    nRals = Settings.data['RunSettings']['nRals']

    drill = "drill"
    spark = "spark"

    compareResults = True
    if 'compare_results' in Settings.data['RunSettings']:
        compareResults = Settings.data['RunSettings']['compare_results']

    if (Settings.execution_mode == ExecutionMode.FULL and compareResults
            == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:

        # Create Table Drill ------------------------------------------------------------------------------------------------------
        from pydrill.client import PyDrill
        drill = PyDrill(host='localhost', port=8047)
        createSchema.init_drill_schema(
            drill,
            Settings.data['TestSettings']['dataDirectory'],
            bool_test=True)

        # Create Table Spark ------------------------------------------------------------------------------------------------------
        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data['TestSettings']['dataDirectory'])

    #Create Context For BlazingSQL
    bc, dask_client = init_context()

    targetTestGroups = Settings.data['RunSettings']['targetTestGroups']
    runAllTests = (
        len(targetTestGroups) == 0
    )  # if targetTestGroups was empty the user wants to run all the tests

    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)

    if runAllTests or ("coalesceTest" in targetTestGroups):
        coalesceTest.main(dask_client, drill, dir_data_file, bc,
                          nRals)  #we are not supporting coalesce yet

    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file, bc,
                                        nRals)

    #countDistincTest.main(dask_client, drill, dir_data_file, bc) #we are not supporting count distinct yet

    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file, bc,
                                     nRals)

    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file, bc,
                                       nRals)

    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    #loadDataTest.main(dask_client, bc) #check this

    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file, bc,
                               nRals)

    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, dir_data_file, bc, nRals)

    #fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc) #HDFS is not working yet

    #mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc) #HDFS is not working yet

    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("simpleDistributionTest" in targetTestGroups):
        simpleDistributionTest.main(dask_client, drill, spark, dir_data_file,
                                    bc, nRals)

    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file, bc,
                             nRals)

    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if Settings.execution_mode != ExecutionMode.GPUCI:
        if runAllTests or ("fileSystemS3Test" in targetTestGroups):
            fileSystemS3Test.main(dask_client, drill, dir_data_file, bc, nRals)

        if runAllTests or ("fileSystemGSTest" in targetTestGroups):
            fileSystemGSTest.main(dask_client, drill, dir_data_file, bc, nRals)

    #timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:

        result, error_msgs = runTest.save_log()

        max = 0
        for i in range(0, len(Settings.memory_list)):
            if (Settings.memory_list[i].delta) > max:
                max = Settings.memory_list[i].delta

        print("MAX DELTA: " + str(max))
        print(
            '*******************************************************************************'
        )

        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" + "   Start Mem: " +
                  str(Settings.memory_list[i].start_mem) + "   End Mem: " +
                  str(Settings.memory_list[i].end_mem) + "   Diff: " +
                  str(Settings.memory_list[i].delta))

        return result, error_msgs

    return True, []
Example #14
def test_authentication_failure():
    with pytest.raises(TransportError):
        PyDrill(auth='user:password')
Example #15
import os

from pydrill.client import PyDrill

# ====== Connection ======
# Connect to Drill by providing a drillbit IP and the Drill REST API port (31000 by default)
conn = PyDrill(host=os.environ['IP_DRILLBIT'],
               port=os.environ['DRILL_API_PORT'])

# ====== Reading files ======
# The file employee.json ships with Drill as a sample dataset
query = conn.query('SELECT * FROM cp.`employee.json` LIMIT 20', timeout=60)

# Create a pandas DataFrame with the result of the query
df = query.to_dataframe()
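
Besides to_dataframe(), the result object can also be iterated row by row, each row being a dict keyed by column name. A small sketch; the column names come from Drill's bundled employee.json sample:

# Print a couple of columns from each returned row
for row in query:
    print(row['employee_id'], row['full_name'])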
Example #16
#!/usr/bin/env python

from pydrill.client import PyDrill
drill = PyDrill(host='10.32.48.136',
                port=48047,
                auth="jlim:2019Jfm!",
                use_ssl=False,
                verify_certs=False)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

tenants = drill.query('''
  SELECT * FROM
  dfs.`/tsys/qa/internal/data/maprdb/tenants`
  LIMIT 5
''')

for result in tenants:
    print(result)
Example #17
    def __init__(self, *args, **kwargs):
        self._kwargs = kwargs
        self._args = args
        self._conn = PyDrill(**kwargs)
Example #18
# IMPORTS
########################################################################################################################
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Event
from flask import Flask
import os
from pydrill.client import PyDrill
from tb.modules.trending_brand_query_engine.query_engine import tweets_per_minute, sentiment_query_engine, last_message_form_kafka
from kafka import KafkaConsumer
########################################################################################################################


consumer = KafkaConsumer('beer', group_id='ui2')
drill1 = PyDrill(host='localhost', port=8047)
drill2 = PyDrill(host='localhost', port=8047)

server = Flask('my app')
server.secret_key = os.environ.get('secret_key', 'secret')

app = dash.Dash('streaming-wind-app', server=server,
                url_base_pathname='/BEER/',
                csrf_protect=False)

app.layout = html.Div([
    html.Div([
        html.H2("Trending-Brand: Beer consumption"),
    ], className='banner'),
    html.Div([
        html.Div([
Example #19
def main(dask_client, bc):

    # Create Table Drill ------------------------------------------------

    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)

    dir_data_lc = Settings.data["TestSettings"]["dataDirectory"]

    for x in range(5):

        # [numberOfFiles, type_nation, type_region, type_supplier,
        #  type_customer, type_lineitem, type_orders]

        run = []

        if x == 0:
            run = [1, "psv", "psv", "psv", "psv", "psv", "psv"]
        elif x == 1:
            run = [
                2, "parquet", "parquet", "parquet", "parquet", "parquet",
                "parquet"
            ]
        elif x == 2:
            run = [6, "parquet", "psv", "parquet", "psv", "parquet", "psv"]
        elif x == 3:
            run = [10, "psv", "parquet", "psv", "parquet", "psv", "parquet"]
        elif x == 4:
            run = [12, "psv", "psv", "parquet", "parquet", "psv", "parquet"]

        print("============================================================")
        print("Running " + str(x + 1) + ":")
        print("Número de Archivos: " + str(run[0]))
        print("Type of files for Nation: " + run[1])
        print("Type of files for Region: " + run[2])
        print("Type of files for Supplier: " + run[3])
        print("Type of files for Customer: " + run[4])
        print("Type of files for Lineitem: " + run[5])
        print("Type of files for Orders: " + run[6])
        print("============================================================")
        print("1")
        num_files = run[0]
        print("2")
        cs.init_drill_schema(drill, dir_data_lc, n_files=num_files)
        print("3")
        # Read Data TPCH-----------------------------------------------------
        nation_files = cs.get_filenames_table("nation", dir_data_lc, num_files,
                                              run[1])
        bc.create_table(
            "nation",
            nation_files,
            delimiter="|",
            dtype=cs.get_dtypes("nation"),
            names=cs.get_column_names("nation"),
        )

        region_files = cs.get_filenames_table("region", dir_data_lc, num_files,
                                              run[2])
        bc.create_table(
            "region",
            region_files,
            delimiter="|",
            dtype=cs.get_dtypes("region"),
            names=cs.get_column_names("region"),
        )

        supplier_files = cs.get_filenames_table("supplier", dir_data_lc,
                                                num_files, run[3])
        bc.create_table(
            "supplier",
            supplier_files,
            delimiter="|",
            dtype=cs.get_dtypes("supplier"),
            names=cs.get_column_names("supplier"),
        )

        customer_files = cs.get_filenames_table("customer", dir_data_lc,
                                                num_files, run[4])
        bc.create_table(
            "customer",
            customer_files,
            delimiter="|",
            dtype=cs.get_dtypes("customer"),
            names=cs.get_column_names("customer"),
        )

        lineitem_files = cs.get_filenames_table("lineitem", dir_data_lc,
                                                num_files, run[5])
        bc.create_table(
            "lineitem",
            lineitem_files,
            delimiter="|",
            dtype=cs.get_dtypes("lineitem"),
            names=cs.get_column_names("lineitem"),
        )

        orders_files = cs.get_filenames_table("orders", dir_data_lc, num_files,
                                              run[6])
        bc.create_table(
            "orders",
            orders_files,
            delimiter="|",
            dtype=cs.get_dtypes("orders"),
            names=cs.get_column_names("orders"),
        )

        # Run Query ------------------------------------------------------
        # Parameter to indicate whether it is necessary to order
        # the result sets before comparing them
        worder = 1
        use_percentage = False
        acceptable_difference = 0.01
        queryType = "Load Data Test"

        print("==============================")
        print(queryType)
        print("==============================")

        queryId = "TEST_01"
        query = """select count(c_custkey) as c1, count(c_acctbal) as c2
                from customer"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_02"
        query = "select count(n_nationkey), count(n_regionkey) from nation"
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_03"
        query = "select count(s_suppkey), count(s_nationkey) from supplier"
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_04"
        query = """select count(c_custkey), sum(c_acctbal),
                    sum(c_acctbal)/count(c_acctbal), min(c_custkey),
                    max(c_nationkey),
                    (max(c_nationkey) + min(c_nationkey))/2 c_nationkey
                    from customer
                    where c_custkey < 100 group by c_nationkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_05"
        query = """select c.c_custkey, c.c_nationkey, n.n_regionkey
                from customer as c inner join nation as n
                on c.c_nationkey = n.n_nationkey
                where n.n_regionkey = 1 and c.c_custkey < 50"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_06"
        query = """select c_custkey, c_nationkey, c_acctbal
                from customer order by c_nationkey, c_acctbal"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            0,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_07"
        query = """select c_custkey + c_nationkey, c_acctbal
                from customer order by 1, 2"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            0,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_08"
        query = """select n1.n_nationkey as supp_nation,
                n2.n_nationkey as cust_nation,
                l.l_extendedprice * l.l_discount
                from supplier as s
                inner join lineitem as l on s.s_suppkey = l.l_suppkey
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                inner join nation as n1 on s.s_nationkey = n1.n_nationkey
                inner join nation as n2 on c.c_nationkey = n2.n_nationkey
                where n1.n_nationkey = 1
                and n2.n_nationkey = 2 and o.o_orderkey < 10000"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_09"
        query = """select c_custkey, c_nationkey as nkey
                from customer where c_custkey < 0 and c_nationkey >= 30"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_10"
        query = """select sin(c_acctbal), cos(c_acctbal),
                    sin(c_acctbal), acos(c_acctbal),
                    ln(c_acctbal), tan(c_acctbal),
                    atan(c_acctbal), floor(c_acctbal),
                    c_acctbal
                from customer"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_11"
        query = """select n1.n_nationkey as n1key,
                n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey
                from nation as n1 full outer join nation as n2
                on n1.n_nationkey = n2.n_nationkey + 6
                where n1.n_nationkey < 10 and n1.n_nationkey > 5"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_12"
        query = """select count(n1.n_nationkey) as n1key,
                    count(n2.n_nationkey) as n2key, count(*) as cstar
                from nation as n1 full outer join nation as n2
                on n1.n_nationkey = n2.n_nationkey + 6"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_13"
        query = """select o_orderkey, o_custkey from orders
                where o_orderkey < 10 and o_orderkey >= 1"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_14"
        query = """select 100168549 - sum(o_orderkey)/count(o_orderkey),
                    56410984 / sum(o_totalprice),
                    (123 - 945/max(o_orderkey)) / (sum(81619/o_orderkey) /
                    count(81619/o_orderkey))
                from orders"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_15"
        query = """select o_orderkey, sum(o_totalprice)/count(o_orderstatus)
                from orders where o_custkey < 100
                group by o_orderstatus, o_orderkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_16"
        query = """select o_orderkey, o_orderstatus
                from orders where o_custkey < 10
                and o_orderstatus <> 'O'
                order by o_orderkey, o_orderstatus
                limit 50"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_17"
        query = """select count(o_orderstatus)
                from orders where o_orderstatus <> 'O'"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_18"
        query = """select count(o_orderkey), sum(o_orderkey), o_clerk
                from orders where o_custkey < 1000
                group by o_clerk, o_orderstatus"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_19"
        query = """select sum(o_orderkey)/count(o_orderkey)
                from orders group by o_orderstatus"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_20"
        query = """select count(o_shippriority), sum(o_totalprice)
                from orders group by o_shippriority"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_21"
        query = """with regionTemp as ( select r_regionkey,
                 r_name from region where r_regionkey > 2 ),
                nationTemp as(select n_nationkey, n_regionkey as fkey,
                n_name from nation where n_nationkey > 3
                order by n_nationkey)
                select regionTemp.r_name, nationTemp.n_name
                from regionTemp inner join nationTemp on
                regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_22"
        query = """select o.o_totalprice, l.l_partkey
                from orders as o
                left outer join lineitem as l on o.o_custkey = l.l_linenumber
                and l.l_suppkey = o.o_orderkey where l.l_linenumber < 1000"""
        runTest.run_query_performance(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_23"
        query = """select o.o_orderkey, o.o_totalprice,
                l.l_partkey, l.l_returnflag from lineitem as l
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                where c.c_custkey < 1000"""
        runTest.run_query_performance(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_24"
        query = """select o.o_orderkey, o.o_totalprice,
                l.l_partkey, l.l_linestatus from orders as o
                full outer join lineitem as l on
                l.l_orderkey = o.o_orderkey where o.o_orderkey < 1000"""
        runTest.run_query_performance(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        runTest.save_log()
Example #20
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)
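
A minimal sketch of the unittest.TestCase this setUp could belong to; the class name and the test method are illustrative:

import unittest

from pydrill.client import PyDrill


class DrillTestCase(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    def test_drillbit_is_reachable(self):
        # Requires a Drillbit listening on localhost:8047
        self.assertTrue(self.drill.is_active())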
Example #21
    def process_result(self, sql, queue, projvartocols, coltotemplates, res_dict=None):
        c = 0
        try:
            if not self.drill.is_active():
                try:
                    self.drill = PyDrill(host=self.host, port=self.port)
                except Exception as ex:
                    print("Exception while connecting to Drill for query processing", ex)
                    return 0
            try:
                results = self.drill.query(sql, timeout=1000)
            except Exception as ex:
                print("Exception while running query to Drill for query processing", ex)
                return 0

            for row in results:
                c += 1
                # if res_dict is not None:
                #     rowtxt = ",".join(list(row.values()))
                #     if rowtxt in res_dict:
                #         continue
                #     else:
                #         res_dict.append(rowtxt)

                res = {}
                skip = False
                for r in row:
                    if row[r] == 'null':
                        skip = True
                        break
                    if '_' in r and r[:r.find("_")] in projvartocols:
                        s = r[:r.find("_")]
                        if s in res:
                            val = res[s]
                            if 'http://' in row[r]:
                                res[s] = row[r]
                            else:
                                res[s] = val.replace('{' + r[r.find("_") + 1:] + '}', row[r].replace(" ", '_'))
                        else:
                            if 'http://' in r:
                                res[s] = r
                            else:
                                res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                                   row[r].replace(" ", '_'))
                    elif r in projvartocols and r in coltotemplates:
                        if 'http://' in row[r]:
                            res[r] = row[r]
                        else:
                            res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}', row[r].replace(" ", '_'))
                    else:
                        res[r] = row[r]

                if not skip:
                    queue.put(res)
                    # if 'keggCompoundId' in res:
                    #     print(res['keggCompoundId'])
            return c
        except Exception as e:
            print("Exception while processing drill results", e, sql)
            logger.error(sql)
            logger.error("Exception while processing results:" + str(e))
            import traceback
            traceback.print_stack()
            return c
예제 #22
0
#!/usr/local/bin/python
from pydrill.client import PyDrill
from bson.json_util import dumps

# CGI response header
print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise RuntimeError('Please run Drill first')
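# Histogram of review star counts for restaurants in the selected city,
# computed by Drill over the MongoDB-backed yelp_dataset collection.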

city="Las Vegas"

yelp_reviews = drill.query('''
  select sum(case when t.stars=1.0 then 1 else 0 end) as `1`,
  sum(case when t.stars=2.0 then 1 else 0 end) as `2`,
  sum(case when t.stars=2.5 then 1 else 0 end) as `3`,
   sum(case when t.stars=3.0 then 1 else 0 end) as `4`,
   sum(case when t.stars=3.5 then 1 else 0 end) as `5`,
   sum(case when t.stars=4.0 then 1 else 0 end) as `6`,
   sum(case when t.stars=4.5 then 1 else 0 end) as `7`,
   sum(case when t.stars=5.0 then 1 else 0 end) as `8`
   from `mongo.274_BI`.`yelp_dataset`t where t.city='Pittsburgh' and true=repeated_contains(categories,'Restaurants')

''')

print dumps(yelp_reviews)
예제 #23
0
def main(dask_client, bc):

    # Create Table Drill ------------------------------------------------------------------------------------------------------

    drill = PyDrill(host='localhost', port=8047)

    dir_data_lc = Settings.data['TestSettings']['dataDirectory']

    for x in range(5):

        # [numberOfFiles, type_nation, type_region, type_supplier, type_customer, type_lineitem, type_orders]

        run = []

        if x == 0:
            run = [1, 'psv', 'psv', 'psv', 'psv', 'psv', 'psv']
        elif x == 1:
            run = [
                2, 'parquet', 'parquet', 'parquet', 'parquet', 'parquet',
                'parquet'
            ]
        elif x == 2:
            run = [6, 'parquet', 'psv', 'parquet', 'psv', 'parquet', 'psv']
        elif x == 3:
            run = [10, 'psv', 'parquet', 'psv', 'parquet', 'psv', 'parquet']
        elif x == 4:
            run = [12, 'psv', 'psv', 'parquet', 'parquet', 'psv', 'parquet']

        print(
            "======================================================================="
        )
        print("Running " + str(x + 1) + ":")
        print("Número de Archivos: " + str(run[0]))
        print("Type of files for Nation: " + run[1])
        print("Type of files for Region: " + run[2])
        print("Type of files for Supplier: " + run[3])
        print("Type of files for Customer: " + run[4])
        print("Type of files for Lineitem: " + run[5])
        print("Type of files for Orders: " + run[6])
        print(
            "======================================================================="
        )
        print("1")
        num_files = run[0]
        print("2")
        cs.init_drill_schema(drill, dir_data_lc, n_files=num_files)
        print("3")
        #Read Data TPCH------------------------------------------------------------------------------------------------------------
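        # Each TPCH table is created in the BlazingContext from the generated
        # files ('|'-delimited psv or parquet, per the current run), using the
        # dtypes and column names provided by the createSchema helper (cs).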
        nation_files = cs.get_filenames_table('nation', dir_data_lc, num_files,
                                              run[1])
        bc.create_table('nation',
                        nation_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('nation'),
                        names=cs.get_column_names('nation'))

        region_files = cs.get_filenames_table('region', dir_data_lc, num_files,
                                              run[2])
        bc.create_table('region',
                        region_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('region'),
                        names=cs.get_column_names('region'))

        supplier_files = cs.get_filenames_table('supplier', dir_data_lc,
                                                num_files, run[3])
        bc.create_table('supplier',
                        supplier_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('supplier'),
                        names=cs.get_column_names('supplier'))

        customer_files = cs.get_filenames_table('customer', dir_data_lc,
                                                num_files, run[4])
        bc.create_table('customer',
                        customer_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('customer'),
                        names=cs.get_column_names('customer'))

        lineitem_files = cs.get_filenames_table('lineitem', dir_data_lc,
                                                num_files, run[5])
        bc.create_table('lineitem',
                        lineitem_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('lineitem'),
                        names=cs.get_column_names('lineitem'))

        orders_files = cs.get_filenames_table('orders', dir_data_lc, num_files,
                                              run[6])
        bc.create_table('orders',
                        orders_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('orders'),
                        names=cs.get_column_names('orders'))

        #Run Query -----------------------------------------------------------------------------
        worder = 1  # Parameter to indicate whether the result sets must be ordered before comparing them
        use_percentage = False
        acceptable_difference = 0.01
        queryType = 'Load Data Test'
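        # NOTE: the run_query calls below pass a fileSchemaType argument that is
        # assumed to come from the surrounding test harness (e.g. a DataType value
        # from createSchema); it is not defined in this snippet.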

        print('==============================')
        print(queryType)
        print('==============================')

        queryId = 'TEST_01'
        query = "select count(c_custkey) as c1, count(c_acctbal) as c2 from customer"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_02'
        query = "select count(n_nationkey), count(n_regionkey) from nation"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_03'
        query = "select count(s_suppkey), count(s_nationkey) from supplier"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_04'
        query = "select count(c_custkey), sum(c_acctbal), sum(c_acctbal)/count(c_acctbal), min(c_custkey), max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2 c_nationkey from customer where c_custkey < 100 group by c_nationkey"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference,
                          True)  #TODO: Change sum/count for avg KC

        queryId = 'TEST_05'
        query = "select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_06'
        query = "select c_custkey, c_nationkey, c_acctbal from customer order by c_nationkey, c_acctbal"
        runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_07'
        query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1, 2"
        runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_08'
        query = "select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_09'
        query = "select c_custkey, c_nationkey as nkey from customer where c_custkey < 0 and c_nationkey >=30"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_10'
        query = "select sin(c_acctbal), cos(c_acctbal), asin(c_acctbal), acos(c_acctbal), ln(c_acctbal), tan(c_acctbal), atan(c_acctbal), floor(c_acctbal), c_acctbal from customer"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_11'
        query = "select n1.n_nationkey as n1key, n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 where n1.n_nationkey < 10 and n1.n_nationkey > 5"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_12'
        query = "select count(n1.n_nationkey) as n1key, count(n2.n_nationkey) as n2key, count(*) as cstar from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_13'
        query = "select o_orderkey, o_custkey from orders where o_orderkey < 10 and o_orderkey >= 1"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_14'
        query = "select 100168549 - sum(o_orderkey)/count(o_orderkey), 56410984/sum(o_totalprice), (123 - 945/max(o_orderkey))/(sum(81619/o_orderkey)/count(81619/o_orderkey)) from orders"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference,
                          True)  #TODO: Change sum/count for avg KC

        queryId = 'TEST_15'
        query = "select o_orderkey, sum(o_totalprice)/count(o_orderstatus) from orders where o_custkey < 100 group by o_orderstatus, o_orderkey"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_16'
        query = "select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus <> 'O' order by o_orderkey, o_orderstatus limit 50"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_17'
        query = "select count(o_orderstatus) from orders where o_orderstatus <> 'O'"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_18'
        query = "select count(o_orderkey), sum(o_orderkey), o_clerk from orders where o_custkey < 1000 group by o_clerk, o_orderstatus"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_19'
        query = "select sum(o_orderkey)/count(o_orderkey) from orders group by o_orderstatus"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference,
                          True)  #TODO: Change sum/count for avg KC

        queryId = 'TEST_20'
        query = "select count(o_shippriority), sum(o_totalprice) from orders group by o_shippriority"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_21'
        query = """with regionTemp as ( select r_regionkey, r_name from region where r_regionkey > 2 ),
        nationTemp as(select n_nationkey, n_regionkey as fkey, n_name from nation where n_nationkey > 3 order by n_nationkey)
        select regionTemp.r_name, nationTemp.n_name from regionTemp inner join nationTemp on regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_22'
        query = """select o.o_totalprice, l.l_partkey from orders as o
            left outer join lineitem as l on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey where l.l_linenumber < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        queryId = 'TEST_23'
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l 
            inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_custkey = o.o_custkey where c.c_custkey < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        queryId = 'TEST_24'
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus from orders as o
            full outer join lineitem as l on l.l_orderkey = o.o_orderkey where o.o_orderkey < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        runTest.save_log()