def cli(dbname, host, port, timeout, interactive, version):
    """Cli entry point."""
    if version or dbname == 'version':
        print('datacli version: {}'.format(__version__))
        sys.exit(0)

    configure_logger()

    conn = PyDrill(host=host, port=port)
    if not conn.is_active():
        log.error('unable to reach Drill server')
        return 1

    cli = DataCli(conn, dbname, DataPrompt(), timeout=timeout)
    log.info('connected to Drillbit')

    while True:
        try:
            should_exit = cli.repl(interactive)
            if should_exit:
                break
        except KeyboardInterrupt:
            break  # Control-C pressed
        except EOFError:
            break  # Control-D pressed

    log.info('shutting down...')
    return 0
def get_pydrill_conn(self, _connection_conf):
    if not PYDRILL_AVAILABLE:
        raise ImproperlyConfigured('pydrill must be installed.')

    conn = PyDrill(**_connection_conf)

    class cursor:
        def __init__(self):
            self.data = []
            self.description = ''
            self.cursor_index = -1

        def execute(self, sql):
            self.data = conn.query(sql)
            return self.data

        def fetchone(self, *args, **kwargs):
            self.cursor_index += 1
            try:
                self.description = list(self.data.rows[self.cursor_index].items())
                return self.description
            except IndexError:
                return

        def close(self):
            return

    conn.cursor = cursor
    conn.commit = lambda: None
    return conn
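A minimal usage sketch of the DB-API-style shim above, assuming `get_pydrill_conn` lives on an object whose configuration dict carries the PyDrill keyword arguments; the `fetch_all_rows` helper and the `backend` object are hypothetical names introduced only for illustration.

# Hypothetical driver for the cursor shim defined above; assumes a reachable Drillbit.
def fetch_all_rows(backend, sql):
    conn = backend.get_pydrill_conn({'host': 'localhost', 'port': 8047})
    cur = conn.cursor()          # `cursor` is the shim class, so calling it instantiates one
    cur.execute(sql)
    rows = []
    while True:
        row = cur.fetchone()     # a list of (column, value) pairs, or None past the last row
        if row is None:
            break
        rows.append(dict(row))
    cur.close()
    conn.commit()                # no-op lambda installed by get_pydrill_conn
    return rows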
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    def test_transport_host(self):
        assert self.drill.transport.host == 'localhost'

    def test_transport_port(self):
        assert self.drill.transport.port == 8047

    @responses.activate
    def test_is_active(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'status': 200,
            'content_type': 'application/json',
        })
        assert self.drill.is_active() == True

    @responses.activate
    def test_is_not_active_404(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'content_type': 'application/json',
            'status': 404,
        })
        assert self.drill.is_active() == False

    @responses.activate
    def test_is_not_active_500(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'content_type': 'application/json',
            'status': 500,
        })
        assert self.drill.is_active() == False

    @responses.activate
    def test_is_not_active_timeout(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'content_type': 'application/json',
            'status': 500,
        })
        try:
            self.drill.perform_request('HEAD', '/', params={'request_timeout': 0})
        except TransportError as e:
            assert e.status_code == e.args[0]
            assert e.error == e.args[1]
            assert e.info == e.args[2]
            assert str(e)
        else:
            assert False
def run_query(self, query, user):
    drillbit_host, drillbit_port = self.get_drillbit(
        self.configuration.get('host', None),
        self.configuration.get('port', None),
        self.configuration.get('zookeeper_path', None))

    user_auth = self.configuration.get('user_auth', None)
    if user_auth:
        session = requests.Session()  # Create a session object
        username, password = user_auth.split(':')
        if not self.auth_drill(session, drillbit_host, drillbit_port, username, password):
            json_data = None
            error = 'Invalid credentials for Drill'
            return json_data, error
        connection = PyDrill(host=drillbit_host, port=drillbit_port,
                             connection_class=DrillRequestsHttpConnection,
                             drill_session=session)
    else:
        connection = PyDrill(host=drillbit_host, port=drillbit_port)

    if not connection.is_active():
        json_data = None
        error = 'Please run Drill first'
        return json_data, error

    annotation = self.get_annotation(query)

    try:
        result = None
        for q in self.strip_comments(query).split(';'):
            q = q.strip()
            if not q:
                continue
            q = self.magic_helpers(q)
            q = annotation + q
            result = connection.query(q, timeout=600)

        print(result.rows)
        logger.info(result.rows)

        columns = []
        for col in result.columns:
            columns.append({'name': col, 'friendly_name': col, 'type': TYPE_STRING})
        rows = result.rows

        data = {'columns': columns, 'rows': rows}
        json_data = json.dumps(data, cls=JSONEncoder)
        error = None
    except TransportError as te:
        json_data = None
        error = drillbit_host + '\n' + te.error
    except Exception as ex:
        json_data = None
        error = drillbit_host + '\n' + str(ex)

    return json_data, error
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    @responses.activate
    def test_plan_for_select_employee_mocked(self):
        sql = "SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1"
        responses.add(**{
            'method': responses.POST,
            'url': 'http://localhost:8047/query.json',
            'body': '{"queryType": "SQL","query": "explain plan for %(sql)s"}' % ({'sql': sql}),
            'status': 200,
            'content_type': 'application/json',
            'json': {},
        })
        result = self.drill.plan(sql=sql)
        assert result.response.status_code == 200
def connection(self):
    from pydrill.client import PyDrill
    if self._connection is None:
        self._connection = PyDrill(**self.params)
    return self._connection
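A small illustration of the lazy-initialization pattern above, assuming it is used as a property on a wrapper class whose `params` dict holds the PyDrill keyword arguments; the `DrillSource` class name is hypothetical.

from pydrill.client import PyDrill


class DrillSource:
    def __init__(self, **params):
        self.params = params          # e.g. {'host': 'localhost', 'port': 8047}
        self._connection = None

    @property
    def connection(self):
        # Create the client on first access and reuse it afterwards.
        if self._connection is None:
            self._connection = PyDrill(**self.params)
        return self._connection


source = DrillSource(host='localhost', port=8047)
assert source.connection is source.connection   # the same client instance is reused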
def test_authentication_success(pydrill_url):
    responses.add(**{
        'method': responses.POST,
        'url': "{0}/{1}".format(pydrill_url, 'j_security_check'),
    })
    PyDrill(auth='user:password')
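For reference, a hedged sketch of connecting with authentication against a live cluster, combining the `auth` keyword exercised above with the `is_active()` check used throughout these examples; the host, port, and credentials are placeholders.

from pydrill.client import PyDrill

# Placeholder host, port, and credentials; adjust for your cluster.
drill = PyDrill(host='localhost', port=8047, auth='user:password')
if not drill.is_active():
    raise RuntimeError('Drill is not reachable or the credentials were rejected')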
def init_drill_connection(self):
    self.__drill = PyDrill(host='localhost')
    is_drill_active = self.__drill.is_active()
    if is_drill_active:
        print("Drill is active: %s" % is_drill_active)
    else:
        print('''
        Drill is not active. Start your server in a terminal using command:
        cd /Users/Aymeric/apache-drill-1.5.0
        bin/drill-embedded
        ''')
def main():
    print("**init end2end**")
    Execution.getArgs()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]

    # Create Table Drill -----------------------------------------
    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    # All tests only pass with 100 MB
    csvFromLocalTest.main(drill, dir_data_file)
    csvFromS3Test.main(
        drill, dir_data_file
    )  # AttributeError: 'NoneType' object has no attribute '_cols'
    # vector::_M_range_check: __n
    # (which is 18446744073709551615) >= this->size() (which is 2)
    csvFromHdfsTest.main(
        drill, dir_data_file
    )
    parquetFromLocalTest.main(
        drill, dir_data_file
    )  # All tests only pass with 100 MB
    # All tests pass with 100 MB; with multiple files per
    # table they do not, because not all the files are loaded correctly.
    parquetFromS3Test.main(
        drill, dir_data_file
    )
    parquetFromHdfsTest.main(
        drill, dir_data_file
    )  # Hangs while reading the data

    runTest.save_log()

    for i in range(0, len(Settings.memory_list)):
        print(
            Settings.memory_list[i].name + ":" +
            " Start Mem: " + str(Settings.memory_list[i].start_mem) +
            " End Mem: " + str(Settings.memory_list[i].end_mem) +
            " Diff: " + str(Settings.memory_list[i].delta)
        )
def init_drill():
    # Start Drill schema-----------------------------------------
    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(
        drill,
        Settings.data["TestSettings"]["dataDirectory"],
        bool_test=True)
    createSchema.init_drill_schema(
        drill,
        Settings.data["TestSettings"]["dataDirectory"],
        smiles_test=True,
        fileSchemaType=DataType.PARQUET)
    return drill
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    @responses.activate
    def test_select_employee_mocked(self):
        sql = "SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1"
        expected_result = {
            "columns": ["employee_id", "full_name", "first_name", "last_name",
                        "position_id", "position_title", "store_id",
                        "department_id", "birth_date", "hire_date", "salary",
                        "supervisor_id", "education_level", "marital_status",
                        "gender", "management_role"],
            "rows": [{
                "hire_date": "1994-12-01 00:00:00.0",
                "birth_date": "1961-08-26",
                "department_id": "1",
                "store_id": "0",
                "education_level": "Graduate Degree",
                "first_name": "Sheri",
                "position_id": "1",
                "management_role": "Senior Management",
                "last_name": "Nowmer",
                "gender": "F",
                "position_title": "President",
                "marital_status": "S",
                "salary": "80000.0",
                "employee_id": "1",
                "supervisor_id": "0",
                "full_name": "Sheri Nowmer"
            }]
        }
        responses.add(**{
            'method': responses.POST,
            'url': 'http://localhost:8047/query.json',
            'body': '{"queryType": "SQL","query": "%(sql)s"}' % ({'sql': sql}),
            'status': 200,
            'content_type': 'application/json',
            'json': expected_result,
        })
        result = self.drill.query(sql=sql)
        assert result.response.status_code == 200
        assert result.data == expected_result
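Outside the mocked test above, the same query can be issued against a running Drillbit. A brief sketch, assuming Drill is listening on localhost:8047 and following the row-iteration pattern used by the other examples in this collection.

from pydrill.client import PyDrill

# Assumes a local Drillbit; the classpath `employee.json` sample ships with Drill.
drill = PyDrill(host='localhost', port=8047)
result = drill.query("SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1")
for row in result:           # each row behaves like a dict keyed by column name
    print(row['full_name'], row['salary'])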
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    @responses.activate
    def test_plan_for_select_employee_mocked(self):
        sql = "SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1"
        responses.add(**{
            'method': responses.POST,
            'url': 'http://localhost:8047/query.json',
            'body': '{"queryType": "SQL","query": "explain plan for %(sql)s"}' % ({'sql': sql}),
            'status': 200,
            'content_type': 'application/json',
            'json': {},
        })
        result = self.drill.plan(sql=sql)
        assert result.response.status_code == 200
def main():
    print('**init performance test**')
    Execution.getArgs()
    dir_data_file = Settings.data['TestSettings']['dataDirectory']

    # Create Table Drill ------------------------------------------------------------------------------------------------------
    drill = PyDrill(host='localhost', port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    jobId = 1
    if Settings.data['MysqlConnection']['connectEnabled']:
        from DataBase import mysqlDatabaseManager as msqldb
        jobId = msqldb.getJobId()

    for x in range(0, 10):
        performanceTest.main(drill, dir_data_file)

    runTest.save_log(job_id=jobId)
def main(dask_client, bc): # Create Table Drill ------------------------------------------------ from pydrill.client import PyDrill drill = PyDrill(host="localhost", port=8047) dir_data_lc = Settings.data["TestSettings"]["dataDirectory"] for x in range(5): # [numberOfFiles, type_nation, type_region, type_supplier, # type_customer, type_lineitem, type_orders] run = [] if x == 0: run = [1, "psv", "psv", "psv", "psv", "psv", "psv"] elif x == 1: run = [ 2, "parquet", "parquet", "parquet", "parquet", "parquet", "parquet" ] elif x == 2: run = [6, "parquet", "psv", "parquet", "psv", "parquet", "psv"] elif x == 3: run = [10, "psv", "parquet", "psv", "parquet", "psv", "parquet"] elif x == 4: run = [12, "psv", "psv", "parquet", "parquet", "psv", "parquet"] print("============================================================") print("Running " + str(x + 1) + ":") print("Número de Archivos: " + str(run[0])) print("Type of files for Nation: " + run[1]) print("Type of files for Region: " + run[2]) print("Type of files for Supplier: " + run[3]) print("Type of files for Customer: " + run[4]) print("Type of files for Lineitem: " + run[5]) print("Type of files for Orders: " + run[6]) print("============================================================") print("1") num_files = run[0] print("2") cs.init_drill_schema(drill, dir_data_lc, n_files=num_files) print("3") # Read Data TPCH----------------------------------------------------- nation_files = cs.get_filenames_table("nation", dir_data_lc, num_files, run[1]) bc.create_table( "nation", nation_files, delimiter="|", dtype=cs.get_dtypes("nation"), names=cs.get_column_names("nation"), ) region_files = cs.get_filenames_table("region", dir_data_lc, num_files, run[2]) bc.create_table( "region", region_files, delimiter="|", dtype=cs.get_dtypes("region"), names=cs.get_column_names("region"), ) supplier_files = cs.get_filenames_table("supplier", dir_data_lc, num_files, run[3]) bc.create_table( "supplier", supplier_files, delimiter="|", dtype=cs.get_dtypes("supplier"), names=cs.get_column_names("supplier"), ) customer_files = cs.get_filenames_table("customer", dir_data_lc, num_files, run[4]) bc.create_table( "customer", customer_files, delimiter="|", dtype=cs.get_dtypes("customer"), names=cs.get_column_names("customer"), ) lineitem_files = cs.get_filenames_table("lineitem", dir_data_lc, num_files, run[5]) bc.create_table( "lineitem", lineitem_files, delimiter="|", dtype=cs.get_dtypes("lineitem"), names=cs.get_column_names("lineitem"), ) orders_files = cs.get_filenames_table("orders", dir_data_lc, num_files, run[6]) bc.create_table( "orders", orders_files, delimiter="|", dtype=cs.get_dtypes("orders"), names=cs.get_column_names("orders"), ) # Run Query ------------------------------------------------------ # Parameter to indicate if its necessary to order # the resulsets before compare them worder = 1 use_percentage = False acceptable_difference = 0.01 queryType = "Load Data Test" print("==============================") print(queryType) print("==============================") queryId = "TEST_01" query = """select count(c_custkey) as c1, count(c_acctbal) as c2 from customer""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_02" query = "select count(n_nationkey), count(n_regionkey) from nation" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_03" query = "select count(s_suppkey), 
count(s_nationkey) from supplier" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_04" query = """select count(c_custkey), sum(c_acctbal), sum(c_acctbal)/count(c_acctbal), min(c_custkey), max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2 c_nationkey from customer where c_custkey < 100 group by c_nationkey""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, True, ) # TODO: Change sum/count for avg KC queryId = "TEST_05" query = """select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_06" query = """select c_custkey, c_nationkey, c_acctbal from customer order by c_nationkey, c_acctbal""" runTest.run_query( bc, drill, query, queryId, queryType, 0, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_07" query = """select c_custkey + c_nationkey, c_acctbal from customer order by 1, 2""" runTest.run_query( bc, drill, query, queryId, queryType, 0, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_08" query = """select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_09" query = """select c_custkey, c_nationkey as nkey from customer where c_custkey < 0 and c_nationkey >= 30""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_10" query = """select sin(c_acctbal), cos(c_acctbal), sin(c_acctbal), acos(c_acctbal), ln(c_acctbal), tan(c_acctbal), atan(c_acctbal), floor(c_acctbal), c_acctbal from customer""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_11" query = """select n1.n_nationkey as n1key, n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 where n1.n_nationkey < 10 and n1.n_nationkey > 5""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_12" query = """select count(n1.n_nationkey) as n1key, count(n2.n_nationkey) as n2key, count(*) as cstar from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_13" query = """select o_orderkey, o_custkey from orders where o_orderkey < 10 and o_orderkey >= 1""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_14" 
query = """select 100168549 - sum(o_orderkey)/count(o_orderkey), 56410984 / sum(o_totalprice), (123 - 945/max(o_orderkey)) / (sum(81619/o_orderkey) / count(81619/o_orderkey)) from orders""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, True, ) # TODO: Change sum/count for avg KC queryId = "TEST_15" query = """select o_orderkey, sum(o_totalprice)/count(o_orderstatus) from orders where o_custkey < 100 group by o_orderstatus, o_orderkey""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_16" query = """select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus <> 'O' order by o_orderkey, o_orderstatus limit 50""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_17" query = """select count(o_orderstatus) from orders where o_orderstatus <> 'O'""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_18" query = """select count(o_orderkey), sum(o_orderkey), o_clerk from orders where o_custkey < 1000 group by o_clerk, o_orderstatus""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_19" query = """select sum(o_orderkey)/count(o_orderkey) from orders group by o_orderstatus""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, True, ) # TODO: Change sum/count for avg KC queryId = "TEST_20" query = """select count(o_shippriority), sum(o_totalprice) from orders group by o_shippriority""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_21" query = """with regionTemp as ( select r_regionkey, r_name from region where r_regionkey > 2 ), nationTemp as(select n_nationkey, n_regionkey as fkey, n_name from nation where n_nationkey > 3 order by n_nationkey) select regionTemp.r_name, nationTemp.n_name from regionTemp inner join nationTemp on regionTemp.r_regionkey = nationTemp.fkey""" runTest.run_query( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_22" query = """select o.o_totalprice, l.l_partkey from orders as o left outer join lineitem as l on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey where l.l_linenumber < 1000""" runTest.run_query_performance( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_23" query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey where c.c_custkey < 1000""" runTest.run_query_performance( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) queryId = "TEST_24" query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus from orders as o full outer join lineitem as l on l.l_orderkey = o.o_orderkey where o.o_orderkey < 1000""" runTest.run_query_performance( bc, drill, query, queryId, queryType, worder, "", acceptable_difference, use_percentage, # fileSchemaType, ) runTest.save_log()
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util

#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
    SELECT sum(case when t.attributes.Ambience.romantic='true' then 1 else 0 end) as Romantic,
           sum(case when t.attributes.Ambience.intimate='true' then 1 else 0 end) as Intimate,
           sum(case when t.attributes.Ambience.classy='true' then 1 else 0 end) as Classy,
           sum(case when t.attributes.Ambience.hipster='true' then 1 else 0 end) as Hipster,
           sum(case when t.attributes.Ambience.divey='true' then 1 else 0 end) as Divey,
           sum(case when t.attributes.Ambience.touristy='true' then 1 else 0 end) as Touristy,
           sum(case when t.attributes.Ambience.trendy='true' then 1 else 0 end) as Trendy,
           sum(case when t.attributes.Ambience.upscale='true' then 1 else 0 end) as Upscale,
           sum(case when t.attributes.Ambience.casual='true' then 1 else 0 end) as Casual
    from `mongo.274_BI`.`yelp_dataset` t
    where true=repeated_contains(categories,'American') and t.stars>3.5 and t.city='Phoenix'
''')

print dumps(yelp_reviews)
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util

#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"

yelp_reviews = drill.query('''
    select sum(case when t.attributes.`Price Range`=1 then 1 else 0 end) as One,
           sum(case when t.attributes.`Price Range`=2 then 1 else 0 end) as Two,
           sum(case when t.attributes.`Price Range`=3 then 1 else 0 end) as Three,
           sum(case when t.attributes.`Price Range`=4 then 1 else 0 end) as Four
    from `mongo.274_BI`.`yelp_dataset` t
    where t.city='Phoenix' and true=repeated_contains(categories,'Restaurants')
    order by count(*) asc
''')

print dumps(yelp_reviews)  # pandas dataframe
# IMPORTS ######################################################################################################################
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Event
from flask import Flask
import os
from pydrill.client import PyDrill
from tb.modules.trending_brand_query_engine.query_engine import tweets_per_minute, sentiment_query_engine, last_message_form_kafka
from kafka import KafkaConsumer
########################################################################################################################

consumer = KafkaConsumer('beer', group_id='ui2')

drill1 = PyDrill(host='localhost', port=8047)
drill2 = PyDrill(host='localhost', port=8047)

server = Flask('my app')
server.secret_key = os.environ.get('secret_key', 'secret')

app = dash.Dash('streaming-wind-app', server=server, url_base_pathname='/BEER/', csrf_protect=False)

app.layout = html.Div([
    html.Div([
        html.H2("Trending-Brand: Beer consumption"),
    ], className='banner'),
    html.Div([
        html.Div([
def main():
    print("**init end2end**")
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]
    nRals = Settings.data["RunSettings"]["nRals"]

    drill = "drill"
    spark = "spark"

    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL and compareResults == "true")
            or Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill -----------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            bool_test=True)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True,
            fileSchemaType=DataType.PARQUET)

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data["TestSettings"]["dataDirectory"])
        createSchema.init_spark_schema(
            spark,
            Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True,
            fileSchemaType=DataType.PARQUET)

    targetTestGroups = Settings.data["RunSettings"]["targetTestGroups"]

    # only innerJoinsTest will be with progress bar
    useProgressBar = False
    if "innerJoinsTest" in targetTestGroups:
        useProgressBar = True

    print("Using progress bar: ", useProgressBar)

    # Create Context For BlazingSQL
    bc, dask_client = init_context(useProgressBar=useProgressBar)

    runAllTests = (
        len(targetTestGroups) == 0
    )  # if targetTestGroups was empty the user wants to run all the tests

    if runAllTests or ("hiveFileTest" in targetTestGroups):
        hiveFileTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)

    if runAllTests or ("coalesceTest" in targetTestGroups):
        coalesceTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file,
                                        bc, nRals)

    if runAllTests or ("countDistinctTest" in targetTestGroups):
        countDistinctTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                     bc, nRals)

    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("toTimestampTest" in targetTestGroups):
        toTimestampTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dayOfWeekTest" in targetTestGroups):
        dayOfWeekTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file,
                                       bc, nRals)

    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("crossJoinsTest" in targetTestGroups):
        crossJoinsTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("leftOuterJoinsTest" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, spark, dir_data_file,
                              bc, nRals)

    # loadDataTest.main(dask_client, bc)  # check this

    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, spark, dir_data_file,
                                 bc, nRals)

    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file,
                               bc, nRals)

    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    # HDFS is not working yet
    # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc)

    # HDFS is not working yet
    # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc)

    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("stringCaseTest" in targetTestGroups):
        stringCaseTest.main(dask_client, drill, spark, dir_data_file,
                            bc, nRals)

    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file,
                             bc, nRals)

    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, spark, dir_data_file,
                                 bc, nRals)

    if runAllTests or ("messageValidationTest" in targetTestGroups):
        messageValidationTest.main(dask_client, drill, dir_data_file,
                                   bc, nRals)

    testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]

    if testsWithNulls != "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("fileSystemS3Test" in targetTestGroups):
                fileSystemS3Test.main(dask_client, drill, dir_data_file,
                                      bc, nRals)

            if runAllTests or ("fileSystemGSTest" in targetTestGroups):
                fileSystemGSTest.main(dask_client, drill, dir_data_file,
                                      bc, nRals)

    if runAllTests or ("loggingTest" in targetTestGroups):
        loggingTest.main(dask_client, dir_data_file, bc, nRals)

    # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    # TODO re enable this test once we have the new version of dask
    # https://github.com/dask/distributed/issues/4645
    # https://github.com/rapidsai/cudf/issues/7773
    # if runAllTests or ("smilesTest" in targetTestGroups):
    #     smilesTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("jsonTest" in targetTestGroups):
            jsonTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("windowFunctionTest" in targetTestGroups):
        windowFunctionTest.main(dask_client, drill, spark, dir_data_file,
                                bc, nRals)

    if runAllTests or ("windowNoPartitionTest" in targetTestGroups):
        windowNoPartitionTest.main(dask_client, drill, spark, dir_data_file,
                                   bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("concurrentTest" in targetTestGroups):
            concurrentTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if testsWithNulls == "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("tablesFromSQL" in targetTestGroups):
                tablesFromSQL.main(dask_client, drill, dir_data_file, bc, nRals)

    # WARNING!!! This Test must be the last one to test -------------------------------------------------------------------------------------------------------------------------------------------
    if runAllTests or ("configOptionsTest" in targetTestGroups):
        configOptionsTest.main(dask_client, drill, spark, dir_data_file,
                               bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:
        result, error_msgs = runTest.save_log(
            Settings.execution_mode == ExecutionMode.GPUCI)

        max = 0
        for i in range(0, len(Settings.memory_list)):
            if (Settings.memory_list[i].delta) > max:
                max = Settings.memory_list[i].delta
        print("MAX DELTA: " + str(max))
        print("""***********************************************************
********************""")

        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" +
                  " Start Mem: " + str(Settings.memory_list[i].start_mem) +
                  " End Mem: " + str(Settings.memory_list[i].end_mem) +
                  " Diff: " + str(Settings.memory_list[i].delta))

        return result, error_msgs

    return True, []
nvmlInit()

drill = "drill"  # None
spark = "spark"

compareResults = True
if "compare_results" in Settings.data["RunSettings"]:
    compareResults = Settings.data["RunSettings"]["compare_results"]

if ((Settings.execution_mode == ExecutionMode.FULL and compareResults == "true")
        or Settings.execution_mode == ExecutionMode.GENERATOR):
    # Create Table Drill ------------------------------------------------
    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)
    cs.init_drill_schema(drill,
                         Settings.data["TestSettings"]["dataDirectory"])

    # Create Table Spark -------------------------------------------------
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("timestampTest").getOrCreate()
    cs.init_spark_schema(spark,
                         Settings.data["TestSettings"]["dataDirectory"])

# Create Context For BlazingSQL
bc, dask_client = init_context()

nRals = Settings.data["RunSettings"]["nRals"]
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util

#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
    select avg(t.latitude) as latitude, avg(t.longitude) as longitude
    from `mongo.274_BI`.`yelp_dataset` t
    where true=repeated_contains(categories,'Fast Food') and t.stars>3.5 and t.city='Pittsburgh'
''')

print dumps(yelp_reviews)  # pandas dataframe
def setUp(self):
    self.drill = PyDrill(host='localhost', port=8047)
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
import cgi
import sys
from bson.json_util import dumps
from bson import json_util

print "Content-type: application/json\n\n"
#print "Content-type: application/json\n\n";
#print "Content-type: text/html\n\n";
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = cgi.FieldStorage().value
#print [city]
city1 = json.loads(city)

query = ("select sum(case when t.attributes.Ambience.romantic='true' then 1 else 0 end) as Romantic,"
         + "sum(case when t.attributes.Ambience.intimate='true' then 1 else 0 end) as Intimate,"
         + "sum(case when t.attributes.Ambience.classy='true' then 1 else 0 end) as Classy,"
         + "sum(case when t.attributes.Ambience.hipster='true' then 1 else 0 end) as Hipster,"
         + "sum(case when t.attributes.Ambience.divey='true' then 1 else 0 end) as Divey,"
         + "sum(case when t.attributes.Ambience.touristy='true' then 1 else 0 end) as Touristy,"
         + "sum(case when t.attributes.Ambience.trendy='true' then 1 else 0 end) as Trendy,"
         + "sum(case when t.attributes.Ambience.upscale='true' then 1 else 0 end) as Upscale,"
         + "sum(case when t.attributes.Ambience.casual='true' then 1 else 0 end) as Casual "
         + "from `mongo.274_BI`.`yelp_dataset`t where true=repeated_contains(categories,'"
         + city1["data"] + "')and t.stars>3.5 and t.city='" + city1["data2"] + "'")
#print query

yelp_reviews = drill.query(query)
print dumps(yelp_reviews)
#!/usr/bin/env python
from pydrill.client import PyDrill

drill = PyDrill(host='10.32.48.136', port=48047, auth="jlim:2019Jfm!",
                use_ssl=False, verify_certs=False)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

tenants = drill.query('''
    SELECT * FROM dfs.`/tsys/qa/internal/data/maprdb/tenants` LIMIT 5
''')

for result in tenants:
    print result
def pydrill_instance():
    drill = PyDrill()
    return drill
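The function above reads like a pytest fixture. A minimal sketch of how it might be declared and consumed, assuming PyDrill's defaults point at localhost:8047 and reusing the `responses`-mocked endpoint pattern from the test classes in this collection; the decorator and test body are illustrative additions.

import pytest
import responses
from pydrill.client import PyDrill


@pytest.fixture
def pydrill_instance():
    # Assumed to default to localhost:8047 when no host/port are given.
    return PyDrill()


@responses.activate
def test_is_active_with_fixture(pydrill_instance):
    # Mock the HEAD probe that is_active() performs.
    responses.add(method=responses.HEAD,
                  url='http://localhost:8047/',
                  status=200,
                  content_type='application/json')
    assert pydrill_instance.is_active()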
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util

#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
    SELECT city, count(*) totalreviews
    FROM `mongo.274_BI`.`yelp_dataset`
    where true=repeated_contains(categories,'Restaurants')
    group by city
    order by count(*) desc
    limit 5
''')

print dumps(yelp_reviews)  # pandas dataframe
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util

#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"

yelp_reviews = drill.query('''
    SELECT sum(case when t.attributes.Parking.garage='true' then 1 else 0 end) as Garage,
           sum(case when t.attributes.Parking.street='true' then 1 else 0 end) as Street,
           sum(case when t.attributes.Parking.validated='true' then 1 else 0 end) as Validated,
           sum(case when t.attributes.Parking.valet='true' then 1 else 0 end) as Valet,
           sum(case when t.attributes.Parking.lot='true' then 1 else 0 end) as Lot
    FROM `mongo.274_BI`.`yelp_dataset` t
    where t.city = 'Charlotte' and true=repeated_contains(categories,'Restaurants')
''')

print dumps(yelp_reviews)  # pandas dataframe
def process_result(self, sql, queue, projvartocols, coltotemplates, res_dict=None):
    c = 0
    try:
        if not self.drill.is_active():
            try:
                self.drill = PyDrill(host=self.host, port=self.port)
            except Exception as ex:
                print("Exception while connecting to Drill for query processing", ex)
                return 0
        try:
            results = self.drill.query(sql, timeout=1000)
        except Exception as ex:
            print("Exception while running query to Drill for query processing", ex)
            return 0

        for row in results:
            c += 1
            # if res_dict is not None:
            #     rowtxt = ",".join(list(row.values()))
            #     if rowtxt in res_dict:
            #         continue
            #     else:
            #         res_dict.append(rowtxt)
            res = {}
            skip = False
            for r in row:
                if row[r] == 'null':
                    skip = True
                    break
                if '_' in r and r[:r.find("_")] in projvartocols:
                    s = r[:r.find("_")]
                    if s in res:
                        val = res[s]
                        if 'http://' in row[r]:
                            res[s] = row[r]
                        else:
                            res[s] = val.replace('{' + r[r.find("_") + 1:] + '}',
                                                 row[r].replace(" ", '_'))
                    else:
                        if 'http://' in r:
                            res[s] = r
                        else:
                            res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                               row[r].replace(" ", '_'))
                elif r in projvartocols and r in coltotemplates:
                    if 'http://' in row[r]:
                        res[r] = row[r]
                    else:
                        res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}',
                                                           row[r].replace(" ", '_'))
                else:
                    res[r] = row[r]
            if not skip:
                queue.put(res)
                # if 'keggCompoundId' in res:
                #     print(res['keggCompoundId'])
        return c
    except Exception as e:
        print("Exception while processing drill results", e, sql)
        logger.error(sql)
        logger.error("Exception while processing results:" + str(e))
        import traceback
        traceback.print_stack()
        return c
#!/usr/local/bin/python
from pydrill.client import PyDrill
import json
from bson.json_util import dumps
from bson import json_util

#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
    SELECT sum(case when t.attributes.`Alcohol`<>'none' then 1 else 0 end) as Alcohol,
           sum(case when t.attributes.`Good For Groups`='true' then 1 else 0 end) as `Good For Groups`,
           sum(case when t.attributes.`Accepts Credit Cards`='true' then 1 else 0 end) as `Accept Credit Cards`,
           sum(case when t.attributes.`Wi-Fi`='free' then 1 else 0 end) as `Wi-Fi`,
           sum(case when t.attributes.`Take-out`='true' then 1 else 0 end) as `Take-Out`,
           sum(case when t.attributes.`Coat Check`='true' then 1 else 0 end) as `Coat Check`,
           sum(case when t.attributes.`Takes Reservations`='true' then 1 else 0 end) as `Takes Reservations`,
           sum(case when t.attributes.`Wheelchair Accessible`='true' then 1 else 0 end) as `Wheelchair Accessible`,
           sum(case when t.attributes.`Outdoor Seating`='true' then 1 else 0 end) as `Outdoor Seating`,
           sum(case when t.attributes.`Dogs Allowed`='true' then 1 else 0 end) as `Dogs Allowed`,
           sum(case when t.attributes.`Smoking`='outdoor' then 1 else 0 end) as `Smoking`,
           sum(case when t.attributes.`Waiter Service`='true' then 1 else 0 end) as `Waiter Service`,
           sum(case when t.attributes.`Caters`='true' then 1 else 0 end) as `Caters`,
           sum(case when t.attributes.`Has TV`='true' then 1 else 0 end) as `Has TV`,
class DrillWrapper(object): def __init__(self, datasource, config, rdfmts, star): self.datasource = datasource self.rdfmts = rdfmts self.url = datasource.url self.params = datasource.params self.config = config self.drill = None self.df = None self.result = None self.star = star self.query = None self.prefixes = {} if ':' in self.url: self.host, self.port = self.url.split(':') else: self.host = self.url self.port = '8047' if len(self.datasource.mappings) == 0: self.mappings = self.config.load_mappings(self.datasource.mappingfiles, self.rdfmts) else: # self.mappings = self.config.mappings self.mappings = self.datasource.mappings def executeQuery(self, query, queue=Queue(), limit=-1, offset=0): """ Entry point for query execution on csv files :param querystr: string query :return: """ from time import time # start = time() # print("Start:", start) if len(self.mappings) == 0: print("Empty Mapping") queue.put('EOF') return [] # querytxt = query self.query = qp.parse(query) self.prefixes = getPrefs(self.query.prefs) query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)] if limit > -1 or offset > -1: self.query.limit = limit self.query.offset = offset sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters) # print(sqlquery) # totalres = 0 if sqlquery is None or len(sqlquery) == 0: queue.put("EOF") return [] try: start = time() try: self.drill = PyDrill(host=self.host, port=self.port) except Exception as ex: print("Exception while connecting to Drill", ex) queue.put("EOF") return if not self.drill.is_active(): print('Exception: Please run Drill first') queue.put("EOF") return # print("Drill Initialization cost:", time() - start) logger.info("Drill Initialization cost:" + str(time() - start)) start = time() if isinstance(sqlquery, list): sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0] if len(sqlquery) > 3: sqlquery = " UNION ".join(sqlquery) if isinstance(sqlquery, list): sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0] # logger.info(" UNION ".join(sqlquery)) processqueues = [] processes = [] res_dict = [] for sql in sqlquery: # processquery = Queue() # self.run_union(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict) # print(sql) processquery = Queue() processqueues.append(processquery) p = Process(target=self.run_union, args=(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict,)) p.start() processes.append(p) while len(processqueues) > 0: toremove = [] try: for q in processqueues: if q.get(False) == 'EOF': toremove.append(q) for p in processes: if p.is_alive(): p.terminate() except: pass for q in toremove: processqueues.remove(q) logger.info("Done running:") sw = " UNION ".join(sqlquery) logger.info(sw) else: card = 0 # if limit == -1: limit = 1000 if offset == -1: offset = 0 logger.info(sqlquery) # print(sqlquery) while True: query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset) cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates) card += cardinality if cardinality < limit: break offset = offset + limit # print("Exec in Drill took:", time() - start) logger.info("Exec in Drill took:" + str(time() - start)) except Exception as e: print("Exception ", e) pass # print('End:', time(), "Total results:", totalres) # print("Drill finished after: ", (time()-start)) queue.put("EOF") def run_union(self, sql, queue, projvartocols, coltotemplates, limit, processqueue, res_dict): card = 0 # if limit 
== -1: limit = 1000 offset = 0 while True: query_copy = sql + " LIMIT " + str(limit) + " OFFSET " + str(offset) cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates, res_dict) card += cardinality if cardinality < limit: break offset = offset + limit processqueue.put("EOF") def process_result(self, sql, queue, projvartocols, coltotemplates, res_dict=None): c = 0 try: if not self.drill.is_active(): try: self.drill = PyDrill(host=self.host, port=self.port) except Exception as ex: print("Exception while connecting to Drill for query processing", ex) return 0 try: results = self.drill.query(sql, timeout=1000) except Exception as ex: print("Exception while running query to Drill for query processing", ex) return 0 for row in results: c += 1 # if res_dict is not None: # rowtxt = ",".join(list(row.values())) # if rowtxt in res_dict: # continue # else: # res_dict.append(rowtxt) res = {} skip = False for r in row: if row[r] == 'null': skip = True break if '_' in r and r[:r.find("_")] in projvartocols: s = r[:r.find("_")] if s in res: val = res[s] if 'http://' in row[r]: res[s] = row[r] else: res[s] = val.replace('{' + r[r.find("_") + 1:] + '}', row[r].replace(" ", '_')) else: if 'http://' in r: res[s] = r else: res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}', row[r].replace(" ", '_')) elif r in projvartocols and r in coltotemplates: if 'http://' in row[r]: res[r] = row[r] else: res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}', row[r].replace(" ", '_')) else: res[r] = row[r] if not skip: queue.put(res) # if 'keggCompoundId' in res: # print(res['keggCompoundId']) return c except Exception as e: print("Exception while processing drill results", e, sql) logger.error(sql) logger.error("Exception while processing results:" + str(e)) import traceback traceback.print_stack() return c def get_so_variables(self, triples, proj): tvars = [] for t in triples: if not t.subject.constant: tvars.append(t.subject.name) # exclude variables that are not projected if not t.theobject.constant:# and t.theobject.name in proj: tvars.append(t.theobject.name) return tvars def getsqlfil(self, l, r, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias): if r is not None and '?' in r.name: var = r.name val = l.name else: var = l.name val = r.name # print(val) if '(' in var and ')' in var: var = var[var.find('(') + 1:var.find(')')] if len(var_pred_map) == 0 or var == self.star['triples'][0].subject.name: subjcol = subjmap.value splits = subjcol.split('{') coltotemplates[var[1:]] = subjcol column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) if len(column) > 1: objfilters = [] for col in column: vcolumn = "`" + col + '`' if '<' in val and '>' in val: val = val.replace('<', '').replace('>', '') if '"' not in val and "'" not in val: val = "'" + val + "'" if op == 'REGEX': val = "LOWER('%" + val[1:-1] + "%')" objectfilter = 'LOWER(' + tablealias + '.' + vcolumn + ") LIKE " + val else: objectfilter = tablealias + '.' + vcolumn + op + val objfilters.append(objectfilter) return " AND ".join(objfilters) elif len(column) == 1: column = "`" + column[0] + '`' if '<' in val and '>' in val: val = val.replace('<', '').replace('>', '').replace(splits[0], '') if '"' not in val and "'" not in val: val = "'" + val + "'" if op == 'REGEX': val = "LOWER('%" + val[1:-1] + "%')" objectfilter = 'LOWER(' + tablealias + '.' + column + ") LIKE " + val else: objectfilter = tablealias + '.' 
+ column + op + val return objectfilter if var not in var_pred_map: return None p = var_pred_map[var] pmap, omap = predicate_object_map[p] if omap.objectt.resource_type == TripleMapType.TEMPLATE: coltotemplates[var[1:]] = omap.objectt.value splits = omap.objectt.value.split('{') column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) val = val.replace(splits[0], "") if len(column) == 1: column = column[0] elif omap.objectt.resource_type == TripleMapType.REFERENCE: column = omap.objectt.value else: column = [] if isinstance(column, list): if len(column) > 0: column = column[0] column = "`" + column + '`' if '<' in val and '>' in val: val = val.replace('<', '').replace('>', '') if '"' not in val and "'" not in val: val = "'" + val + "'" if op == 'REGEX': val = "LOWER('%" + val[1:-1] + "%')" objectfilter = 'LOWER(' + tablealias + '.' + column + ") LIKE " + val else: objectfilter = tablealias + '.' + column + op + val return objectfilter def get_Expression_value(self, exp, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias): left = exp.left right = exp.right op = exp.op if op in unaryFunctor: if isinstance(left, Expression) and isinstance(left.left, Argument): left = left.left fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) return fil elif op in binaryFunctor: if op == 'REGEX' and right.desc is not False: if isinstance(left, Expression): if 'xsd:string' in left.op: left = left.left fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) return fil # else: # left = self.get_Expression_value(left, var_pred_map, subjmap,predicate_object_map, coltotemplates, tablealias) # right = self.get_Expression_value(right, var_pred_map, subjmap, predicate_object_map, coltotemplates,tablealias) else: fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) return fil # return op + "(" + str(left) + "," + right.name + "," + right.desc + ")" else: return op + "(" + str(left) + "," + str(right) + ")" elif right is None: return op + str(left) else: if isinstance(left, Argument) and isinstance(right, Argument): fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) return fil if isinstance(left, Expression) and isinstance(right, Expression): leftexp = self.get_Expression_value(left, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) rightexp = self.get_Expression_value(right, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) if op == '||' or op == '|': if leftexp is None or rightexp is None: return None return '(' + leftexp + ' OR ' + rightexp + ')' else: if leftexp is None or rightexp is None: return None return '(' + leftexp + ' AND ' + rightexp + ')' # print(op, type(left), left, type(right), right) return "(" + str(exp.left) + " " + exp.op + " " + str(exp.right) def get_obj_filter(self, f, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias): return self.get_Expression_value(f.expr, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias) def makeJoin(self, mapping_preds, query_filters): coltotemplates = {} projections = {} projvartocol = {} objectfilters = [] constfilters = [] fromclauses = [] database_name = "" i = 0 tm_tablealias = {} subjects = {} projvartocols = {} query = "" for tm, predicate_object_map in mapping_preds.items(): sparqlprojected = 
set(self.get_so_variables(self.star['triples'], [c.name for c in self.query.args])) tablealias = 'Ontario_' + str(i) if isinstance(predicate_object_map, SubjectMap): subjvar = self.star['triples'][0].subject.name var = subjvar var_pred_map = {subjvar: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'} if predicate_object_map.subject.resource_type == TripleMapType.TEMPLATE: coltotemplates[subjvar[1:]] = predicate_object_map.subject.value splits = predicate_object_map.subject.value.split('{') column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) if len(column) == 1: column = column[0] elif predicate_object_map.subject.resource_type == TripleMapType.REFERENCE: column = predicate_object_map.subject.value else: column = [] if isinstance(column, list): j = 0 for col in column: if '[*]' in col: col = col.replace('[*]', '') vcolumn = "`" + col + '`' projections[var[1:] + '_Ontario_' + str(j)] = "FLATTEN(" + tablealias + "." + vcolumn + ") AS " + var[1:] + '_Ontario_' + str(j) else: vcolumn = "`" + col + '`' projections[var[1:] + '_Ontario_' + str(j)] = tablealias + "." + vcolumn + " AS " + var[1:] + '_Ontario_' + str(j) projvartocol.setdefault(var[1:], []).append(col) objectfilters.append(tablealias + '.' + vcolumn + " is not null ") objectfilters.append(tablealias + '.' + vcolumn + " <> '' ") j += 1 else: col = column if '[*]' in col: col = col.replace('[*]', '') column = "`" + col + '`' projections[var[1:]] = "FLATTEN(" + tablealias + "." + column + ") AS `" + var[1:] + '`' else: column = "`" + column + '`' projections[var[1:]] = tablealias + "." + column + " AS `" + var[1:] + '`' projvartocol[var[1:]] = col objectfilters.append(tablealias + '.' + column + " is not null ") objectfilters.append(tablealias + '.' + column + " <> '' ") else: var_pred_map = {var: pred for pred, var in self.star['predicates'].items() if pred in predicate_object_map} column = [] for var in sparqlprojected: if var not in var_pred_map: continue p = var_pred_map[var] pmap, omap = predicate_object_map[p] if omap.objectt.resource_type == TripleMapType.TEMPLATE: coltotemplates[var[1:]] = omap.objectt.value splits = omap.objectt.value.split('{') column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) if len(column) == 1: column = column[0] elif omap.objectt.resource_type == TripleMapType.REFERENCE: column = omap.objectt.value else: column = [] if isinstance(column, list): j = 0 for col in column: if '[*]' in col: col = col.replace('[*]', '') vcolumn = "`" + col + '`' projections[var[1:] + '_Ontario_' + str(j)] = "FLATTEN(" + tablealias + "." + vcolumn + ") AS " + var[1:] + '_Ontario_' + str(j) else: vcolumn = "`" + col + '`' projections[var[1:] + '_Ontario_' + str(j)] = tablealias + "." + vcolumn + " AS `" + var[1:] + '_Ontario_' + str(j) + '`' projvartocol.setdefault(var[1:], []).append(col) objectfilters.append(tablealias + '.' + vcolumn + " is not null ") objectfilters.append(tablealias + '.' + vcolumn + " <> '' ") j += 1 else: col = column if '[*]' in col: col = col.replace('[*]', '') column = "`" + col + '`' projections[var[1:]] = "FLATTEN(" + tablealias + "." + column + ") AS `" + var[1:] + '`' else: column = "`" + column + '`' projections[var[1:]] = tablealias + "." + column + " AS `" + var[1:] + '`' projvartocol[var[1:]] = col objectfilters.append(tablealias + '.' + column + " is not null ") objectfilters.append(tablealias + '.' 
+ column + " <> '' ") for f in query_filters: #if len(set(f.getVars()).intersection(list(var_pred_map.keys()))) == len(set(f.getVars())): fil = self.get_obj_filter(f, var_pred_map, self.mappings[tm].subject_map.subject, predicate_object_map, coltotemplates, tablealias) if fil is not None and len(fil) > 0: constfilters.append(fil) tm_tablealias[tablealias] = tm triplemap = self.mappings[tm] subjects[tm] = triplemap.subject_map.subject logicalsource = triplemap.logical_source data_source = logicalsource.data_source # tablename = data_source.name # database_name = logicalsource.iterator #TODO: this is not correct, only works for LSLOD-Custom experiment database_name = data_source.name if '/' in database_name: database_name = database_name.split('/')[-1] tablename = data_source.name # TODO: change the paths, this works only for LSLOD-experiment if self.datasource.dstype == DataSourceType.LOCAL_TSV: # fileext = 'dfs.`/data/tsv/' + database_name + '/' + tablename + '.tsv`' fileext = 'dfs.`/data/tsv/' + tablename + '`' elif self.datasource.dstype == DataSourceType.LOCAL_CSV: # fileext = 'dfs.`/data/csv/' + database_name + '/' + tablename + '.csv`' fileext = 'dfs.`/data/csv/' + tablename + '`' elif self.datasource.dstype == DataSourceType.LOCAL_JSON: # fileext = 'dfs.`/data/json/' + database_name + '/' + tablename + '.json`' fileext = 'dfs.`/data/json/' + tablename + '`' elif self.datasource.dstype == DataSourceType.HADOOP_TSV: # fileext = 'hdfs.`/user/kemele/data/tsv/' + database_name + '/' + tablename + '.tsv`' fileext = 'hdfs.`/user/kemele/data/tsv/' + tablename + '`' elif self.datasource.dstype == DataSourceType.HADOOP_CSV: # fileext = 'hdfs.`/user/kemele/data/csv/' + database_name + '/' + tablename + '.csv`' fileext = 'hdfs.`/user/kemele/data/csv/' + tablename + '`' elif self.datasource.dstype == DataSourceType.HADOOP_JSON: # fileext = 'hdfs.`/user/kemele/data/json/' + database_name + '/' + tablename + '.json`' fileext = 'hdfs.`/user/kemele/data/json/' + tablename + '`' else: fileext = '' fromclauses.append(fileext + ' ' + tablealias) i += 1 for var, p in var_pred_map.items(): if '?' not in var: pmap, omap = predicate_object_map[p] if omap.objectt.resource_type == TripleMapType.TEMPLATE: # omap.objectt.value splits = omap.objectt.value.split('{') column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) var = var.replace(splits[0], '').replace('}', '') if '<' in var and '>' in var: var = var[1:-1] var = "'" + var + "'" elif omap.objectt.resource_type == TripleMapType.REFERENCE: column = omap.objectt.value if "'" not in var and '"' not in var: var = "'" + var + "'" if '"' in var: var = "'" + var[1:-1] + "'" else: column = [] if isinstance(column, list): j = 0 for col in column: vcolumn = "`" + col + '`' constfilters.append(tablealias + "." + vcolumn + " = " + var) j += 1 else: column = "`" + column + '`' constfilters.append(tablealias + "." + column + " = " + var) subj = self.star['triples'][0].subject.name if not self.star['triples'][0].subject.constant else None invalidsubj = False if subj is not None: filtersadded = [] for tm, subject in subjects.items(): subjcol = subject.value tablealias = [v for v in tm_tablealias if tm_tablealias[v] == tm][0] splits = subjcol.split('{') coltotemplates[subj[1:]] = subjcol column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) if len(column) > 1: j = 0 for col in column: vcolumn = "`" + col + '`' projections[subj[1:] + '_Ontario_' + str(j)] = tablealias + "." 
+ vcolumn + " AS `" + subj[1:] + '_Ontario_' + str(j) + '`' projvartocol.setdefault(subj[1:], []).append(col) objectfilters.append(tablealias + '.' + vcolumn + " is not null ") objectfilters.append(tablealias + '.' + vcolumn + " <> '' ") j += 1 elif len(column) == 1: col = column[0] column = "`" + col + '`' projections[subj[1:]] = tablealias + "." + column + " AS `" + subj[1:] + '`' projvartocol[subj[1:]] = col objectfilters.append(tablealias + '.' + column + " is not null ") objectfilters.append(tablealias + '.' + column + " <> '' ") else: subj = self.star['triples'][0].subject.name for tm, subject in subjects.items(): subjcol = subject.value tablealias = [v for v in tm_tablealias if tm_tablealias[v] == tm][0] splits = subjcol.split('{') column = [] for sp in splits[1:]: column.append(sp[:sp.find('}')]) if len(splits[0]) > 0 and splits[0] not in subj: invalidsubj = True break var = subj.replace(splits[0], '').replace('}', '') if '<' in var and '>' in var: var = var[1:-1] var = "'" + var + "'" # if isinstance(column, list): j = 0 for col in column: vcolumn = "`" + col + '`' constfilters.append(tablealias + "." + vcolumn + " = " + var) j += 1 if invalidsubj: mapping_preds = [] if len(subjects) > 1: aliases = list(tm_tablealias.keys()) raliases = aliases.copy() raliases.reverse() compared = [] for a1 in aliases: for a2 in aliases: if a1 + a2 in compared: continue if a1 == a2: continue compared.append(a1 + a2) compared.append(a2 + a1) subj1 = subjects[tm_tablealias[a1]].value subj2 = subjects[tm_tablealias[a2]].value column1 = None column2 = None splits = subj1.split('{') for sp in splits: if '}' in sp: column1 = sp[:sp.find('}')] break splits = subj2.split('{') for sp in splits: if '}' in sp: column2 = sp[:sp.find('}')] break column1 = '`' + column1 + '`' column2 = '`' + column2 + '`' if column1 == column2: objectfilters.append(a1 + '.' + column1 + "=" + a2 + "." + column2) objectfilters.extend(constfilters) if len(mapping_preds) > 0: fromcaluse = "\n FROM " + ", ".join(list(set(fromclauses))) distinct = "" if self.query.distinct: distinct = "DISTINCT " projections = " SELECT " + distinct + ", ".join(list(set(projections.values()))) if len(objectfilters) > 0: whereclause = "\n WHERE " + "\n\t AND ".join(list(set(objectfilters))) else: whereclause = "" sqlquery = projections + " " + fromcaluse + " " + whereclause return sqlquery, projvartocol, coltotemplates, database_name return query, projvartocols, coltotemplates, database_name def makeunion(self, tounions, query_filters, subjectunions=False): coltotemplates = {} projvartocols = {} database_name = "" unions = [] rdfmts = list(tounions.keys()) rdfmts = list(reversed(sorted(rdfmts))) # print(rdfmts) for rdfmt in rdfmts: mappingpreds = tounions[rdfmt] if subjectunions: for tm, submaps in mappingpreds.items(): un, projvartocols, coltotemplates, database_name = self.makeJoin({tm: submaps}, query_filters) if un is not None and len(un) > 0: unions.append(un) else: un, projvartocols, coltotemplates, database_name = self.makeJoin(mappingpreds, query_filters) if un is not None and len(un) > 0: unions.append(un) #query = " UNION ".join(unions) # print(query) return unions, projvartocols, coltotemplates, database_name def translate(self, query_filters): rdfmts = self.star['rdfmts'] starpreds = list(self.star['predicates'].keys()) star_preds = [p for p in starpreds if '?' 
not in p] if 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' in star_preds: star_preds.remove('http://www.w3.org/1999/02/22-rdf-syntax-ns#type') touninon = {} completematch = {} if len(star_preds) == 0: subjectonly = False for tm, triplemap in self.mappings.items(): for rdfmt in triplemap.subject_map.rdf_types: if rdfmt in rdfmts: if 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' in starpreds: touninon.setdefault(rdfmt, {})[tm] = triplemap.subject_map subjectonly = True else: touninon.setdefault(rdfmt, {})[tm] = triplemap.predicate_object_map if len(touninon) > 1 or subjectonly: return self.makeunion(touninon, query_filters, subjectonly) elif len(touninon) == 1: query, projvartocols, coltotemplates, database_name = self.makeJoin(touninon[list(touninon.keys())[0]], query_filters) return query, projvartocols, coltotemplates, database_name else: return None, None, None, None else: mapping_preds = {tm: triplemap for tm, triplemap in self.mappings.items() for p in star_preds if p in triplemap.predicate_object_map} for tm, triplemap in mapping_preds.items(): for rdfmt in triplemap.subject_map.rdf_types: if rdfmt in rdfmts and len(set(star_preds).intersection(list(triplemap.predicate_object_map.keys()))) == len(set(star_preds)): completematch[rdfmt] = {} completematch[rdfmt][tm] = triplemap.predicate_object_map if rdfmt in rdfmts and len(set(star_preds).intersection(list(triplemap.predicate_object_map.keys()))) > 0: touninon.setdefault(rdfmt, {})[tm] = triplemap.predicate_object_map if len(completematch) > 0: if len(completematch) == 1: query, projvartocols, coltotemplates, database_name = self.makeJoin( touninon[list(touninon.keys())[0]], query_filters) return query, projvartocols, coltotemplates, database_name else: return self.makeunion(completematch, query_filters) elif len(touninon) > 1: return self.makeunion(touninon, query_filters) elif len(touninon) == 1: query, projvartocols, coltotemplates, database_name = self.makeJoin(touninon[list(touninon.keys())[0]], query_filters) return query, projvartocols, coltotemplates, database_name else: return None, None, None, None
def executeQuery(self, query, queue=Queue(), limit=-1, offset=0): """ Entry point for query execution on csv files :param querystr: string query :return: """ from time import time # start = time() # print("Start:", start) if len(self.mappings) == 0: print("Empty Mapping") queue.put('EOF') return [] # querytxt = query self.query = qp.parse(query) self.prefixes = getPrefs(self.query.prefs) query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)] if limit > -1 or offset > -1: self.query.limit = limit self.query.offset = offset sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters) # print(sqlquery) # totalres = 0 if sqlquery is None or len(sqlquery) == 0: queue.put("EOF") return [] try: start = time() try: self.drill = PyDrill(host=self.host, port=self.port) except Exception as ex: print("Exception while connecting to Drill", ex) queue.put("EOF") return if not self.drill.is_active(): print('Exception: Please run Drill first') queue.put("EOF") return # print("Drill Initialization cost:", time() - start) logger.info("Drill Initialization cost:" + str(time() - start)) start = time() if isinstance(sqlquery, list): sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0] if len(sqlquery) > 3: sqlquery = " UNION ".join(sqlquery) if isinstance(sqlquery, list): sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0] # logger.info(" UNION ".join(sqlquery)) processqueues = [] processes = [] res_dict = [] for sql in sqlquery: # processquery = Queue() # self.run_union(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict) # print(sql) processquery = Queue() processqueues.append(processquery) p = Process(target=self.run_union, args=(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict,)) p.start() processes.append(p) while len(processqueues) > 0: toremove = [] try: for q in processqueues: if q.get(False) == 'EOF': toremove.append(q) for p in processes: if p.is_alive(): p.terminate() except: pass for q in toremove: processqueues.remove(q) logger.info("Done running:") sw = " UNION ".join(sqlquery) logger.info(sw) else: card = 0 # if limit == -1: limit = 1000 if offset == -1: offset = 0 logger.info(sqlquery) # print(sqlquery) while True: query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset) cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates) card += cardinality if cardinality < limit: break offset = offset + limit # print("Exec in Drill took:", time() - start) logger.info("Exec in Drill took:" + str(time() - start)) except Exception as e: print("Exception ", e) pass # print('End:', time(), "Total results:", totalres) # print("Drill finished after: ", (time()-start)) queue.put("EOF")
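# The LIMIT/OFFSET loop inside executeQuery above can be shown in isolation.
# A minimal sketch, assuming a reachable Drillbit on localhost:8047 and a
# placeholder table path; it only uses PyDrill calls that appear elsewhere in
# these sources (query() and .rows).
from pydrill.client import PyDrill

def fetch_all(drill, sql, page_size=1000):
    """Page through a Drill query with LIMIT/OFFSET until a short page signals the end."""
    offset = 0
    rows = []
    while True:
        page = drill.query("{} LIMIT {} OFFSET {}".format(sql, page_size, offset)).rows
        rows.extend(page)
        if len(page) < page_size:  # fewer rows than requested: last page reached
            break
        offset += page_size
    return rows

drill = PyDrill(host='localhost', port=8047)
all_rows = fetch_all(drill, "SELECT * FROM dfs.`/data/tsv/drugbank`")  # hypothetical table path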
#!/usr/local/bin/python
# CGI script: count Yelp restaurant reviews in Charlotte via Drill's MongoDB storage plugin
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured
from bson.json_util import dumps

# output is JSON, so 'Content-type: application/json' would be more accurate
print "Content-type: text/html\n\n"

drill = PyDrill(host='localhost', port=8047)
if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
    select count(*) as `count`
    from `mongo.274_BI`.`yelp_dataset` t
    where true = repeated_contains(categories, 'Restaurants')
      and t.city = 'Charlotte'
''')

print dumps(yelp_reviews.rows)  # rows are plain dicts, ready for JSON serialization
#!/usr/local/bin/python
# CGI script: top 5 restaurant categories by count from the Yelp business dataset (dfs JSON file)
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured
from bson.json_util import dumps

# output is JSON, so 'Content-type: application/json' would be more accurate
print "Content-type: text/html\n\n"

drill = PyDrill(host='localhost', port=8047)
if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"  # note: unused; the query below still hardcodes 'Charlotte'

yelp_reviews = drill.query('''
    select categories[0] as category, count(categories[0]) as number
    from `dfs.yelp`.`yelp_academic_dataset_business.json`
    where true = repeated_contains(categories, 'Restaurants')
      and city = 'Charlotte'
    group by categories[0]
    order by count(categories[0]) desc
    limit 5
''')

print dumps(yelp_reviews.rows)  # rows are plain dicts, ready for JSON serialization
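# The "# pandas dataframe" notes in these CGI scripts point at pydrill's
# to_dataframe() helper (used further below in get_column_types). A minimal
# Python 3 sketch of the same Charlotte count materialized as a DataFrame;
# host, port, and storage-plugin path are taken from the scripts above.
from pydrill.client import PyDrill

drill = PyDrill(host='localhost', port=8047)
df = drill.query('''
    select count(*) as `count`
    from `mongo.274_BI`.`yelp_dataset` t
    where true = repeated_contains(categories, 'Restaurants')
      and t.city = 'Charlotte'
''').to_dataframe()
print(df.head())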
Execution.getArgs()
nvmlInit()

drill = "drill"  # None
spark = "spark"

compareResults = True
if 'compare_results' in Settings.data['RunSettings']:
    compareResults = Settings.data['RunSettings']['compare_results']

if (Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:
    # Create Table Drill ------------------------------------------------------------------------------------------------------
    from pydrill.client import PyDrill
    drill = PyDrill(host='localhost', port=8047)
    cs.init_drill_schema(drill, Settings.data['TestSettings']['dataDirectory'])

    # Create Table Spark ------------------------------------------------------------------------------------------------------
    spark = SparkSession.builder.appName("timestampTest").getOrCreate()
    cs.init_spark_schema(spark, Settings.data['TestSettings']['dataDirectory'])

# Create Context For BlazingSQL
bc, dask_client = init_context()

nRals = Settings.data['RunSettings']['nRals']

main(dask_client, drill, spark, Settings.data['TestSettings']['dataDirectory'], bc, nRals)

if Settings.execution_mode != ExecutionMode.GENERATOR:
def get_column_types(query): drill = PyDrill(host='localhost', port=8049) data = drill.query(query) columns = data.columns types = {} formattedQuery = sqlparse.format(query, reindent=True, keyword_case='upper') formattedQuery = formattedQuery.split('\n') print(sqlparse.format(query, reindent=True, keyword_case='upper')) inSelect = False inSubquery = False inFromClause = False fields = [] fieldRegex = r'\s{7}\S' fieldSubquery = r'\s' subqueryFieldPattern = r'\s{2,3}\S' subqueryField = "" fromClause = "" functionPattern = r'\s+(\S+)\(' fieldCount = 0 for line in formattedQuery: functionMatchObject = re.match(functionPattern, line) if line.startswith('SELECT'): inSelect = True line = line.replace('SELECT', '') line = line.strip() #remove trailing comma if len(line) > 0: if line[-1:] == ",": line = line[:-1] fields.append(line) # If the line is a function, assign the correct return type elif inSelect and inFromClause == False and functionMatchObject: print("FieldCount: " + str(fieldCount) + " " + line) functionCandidate = functionMatchObject.group(1) functionCandidate = functionCandidate.upper() if functionCandidate in _BIG_INT_FUNCTIONS: types[columns[fieldCount]] = "bigint" elif functionCandidate in _INT_FUNCTIONS: types[columns[fieldCount]] = "integer" elif functionCandidate in _FLOAT_FUNCTIONS: types[columns[fieldCount]] = "float" else: types[columns[fieldCount]] = "varchar" fieldCount += 1 continue # Case for a regular field elif inSelect == True and re.match(fieldRegex, line): line = line.strip() # remove trailing comma from field name if len(line) > 0: if line[-1:] == ",": line = line[:-1] fields.append(line) elif inSelect == True and line.startswith('FROM'): inSelect = False inFromClause = True if inSubquery: fields.append(subqueryField) inSubquery = False else: fromClause = fromClause + " " + line.strip() elif inFromClause == True and (line.startswith('WHERE') or line.startswith('GROUP') or line.startswith('ORDER') or line.startswith('HAVING')): inFromClause = False inSelect = False elif re.match(subqueryFieldPattern, line) and inSubquery == False and inFromClause == False: inSubquery = True subqueryField = line.strip() elif inSubquery == True: subqueryField = subqueryField + " " + line.strip() if line.endswith(','): inSubquery = False fields.append(subqueryField) subqueryField = "" elif inSubquery == True and line == False: inSubquery = False fields.append(subqueryField) subqueryField = "" elif inFromClause == True: fromClause = fromClause + " " + line.strip() fieldCount += 1 typeQuery = "SELECT" fieldCount = 0 aliasPattern = r'AS\s`?[a-zA-Z_][a-zA-Z0-9-_$` ]*$' for field in fields: if re.search(aliasPattern, field): field = re.sub(aliasPattern, '', field) if fieldCount > 0: typeQuery += "," typeQuery = typeQuery + " " + field + " AS " + columns[ fieldCount] + ", typeof( " + field + ") AS " + columns[ fieldCount] + "_type" fieldCount += 1 typeQuery += fromClause typeQuery += " LIMIT 1" typeQuery = sqlparse.format(typeQuery, reindent=True, keyword_case='upper') print(typeQuery) fieldQueryResult = drill.query(typeQuery).to_dataframe() tempTypes = fieldQueryResult.T.to_dict()[0] for column in columns: if column not in types.keys(): types[column] = tempTypes[column + "_type"] print(types) return types
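# get_column_types above rebuilds the SELECT list so it can wrap each field in
# Drill's typeof() function. A minimal sketch of that core idea, probing a
# single column on one sampled row; the table path and column name are
# hypothetical placeholders, and the port matches the function above.
from pydrill.client import PyDrill

def probe_column_type(drill, table, column):
    """Return Drill's runtime type name for one column by sampling a single row."""
    sql = "SELECT typeof(`{col}`) AS `{col}_type` FROM {table} LIMIT 1".format(col=column, table=table)
    rows = drill.query(sql).rows
    return rows[0]['{}_type'.format(column)] if rows else None

drill = PyDrill(host='localhost', port=8049)
print(probe_column_type(drill, "dfs.`/data/csv/employee.csv`", 'salary'))  # hypothetical file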
#!/usr/local/bin/python
# CGI script: histogram of star ratings for Pittsburgh restaurants via Drill's MongoDB storage plugin
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured
from bson.json_util import dumps

# output is JSON, so 'Content-type: application/json' would be more accurate
print "Content-type: text/html\n\n"

drill = PyDrill(host='localhost', port=8047)
if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"  # note: unused; the query below still hardcodes 'Pittsburgh'

yelp_reviews = drill.query('''
    select sum(case when t.stars=1.0 then 1 else 0 end) as `1`,
           sum(case when t.stars=2.0 then 1 else 0 end) as `2`,
           sum(case when t.stars=2.5 then 1 else 0 end) as `3`,
           sum(case when t.stars=3.0 then 1 else 0 end) as `4`,
           sum(case when t.stars=3.5 then 1 else 0 end) as `5`,
           sum(case when t.stars=4.0 then 1 else 0 end) as `6`,
           sum(case when t.stars=4.5 then 1 else 0 end) as `7`,
           sum(case when t.stars=5.0 then 1 else 0 end) as `8`
    from `mongo.274_BI`.`yelp_dataset` t
    where t.city = 'Pittsburgh'
      and true = repeated_contains(categories, 'Restaurants')
''')

print dumps(yelp_reviews.rows)  # rows are plain dicts, ready for JSON serialization
import datetime

from pandas import DataFrame
from pydrill.client import PyDrill

import queryDictionary


class DataService:
    __drill = None
    __dictionary = queryDictionary.QueryDictionary

    def __init__(self):
        self.init_drill_connection()

    def init_drill_connection(self):
        self.__drill = PyDrill(host='localhost')
        is_drill_active = self.__drill.is_active()
        if is_drill_active:
            print("Drill is active: %s" % is_drill_active)
        else:
            print('''
            Drill is not active. Start your server in a terminal using command:
            cd /Users/Aymeric/apache-drill-1.5.0
            bin/drill-embedded
            ''')

    def get_users(self):
        query = self.__dictionary.get_users()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users(self):
        query = self.__dictionary.get_elite()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users_count(self):
        query = self.__dictionary.get_elite_count()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users_tip(self):
        query = self.__dictionary.get_elite_tip()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users_review(self):
        query = self.__dictionary.get_elite_review()
        results = self.__drill.query(query, 30)
        return self.get_frame(results)

    def get_restaurant_review(self):
        query = self.__dictionary.get_review()
        results = self.__drill.query(query, 30)
        return self.get_frame(results)

    def get_featureset1_but_votes(self):
        query = self.__dictionary.get_featureset1_but_votes()
        results = self.__drill.query(query, 300)
        return self.get_frame(results)

    def get_user_review(self, review_id):
        query = self.__dictionary.get_user_review(review_id)
        results = self.__drill.query(query, 30)
        return self.get_frame(results)

    def review_dates(self):
        query = self.__dictionary.review_dates()
        print('started on: ' + str(datetime.datetime.now()))
        results = self.__drill.query(query, 600)
        print('finished on: ' + str(datetime.datetime.now()))
        return self.get_frame(results)

    @staticmethod
    def get_frame(results, number_of_records=None):
        return DataFrame(data=results.rows, columns=results.columns)

    @staticmethod
    def print_frame(frame, records_to_display=None):
        print("\n")
        if records_to_display is None:
            print(frame.to_string(justify='right'))
        else:
            print(frame[:records_to_display].to_string(justify='left'))

    def main(self):
        self.print_frame(self.get_elite_users(), 10)
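# Usage sketch for the DataService class above, assuming the queryDictionary
# module is importable and a Drillbit is running on localhost.
if __name__ == '__main__':
    service = DataService()               # connects to the local Drillbit
    service.main()                        # prints the first 10 rows of elite users
    reviews = service.get_restaurant_review()
    DataService.print_frame(reviews, 5)   # show the first 5 restaurant reviews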