def __init__(self, dataID, b_allow_repeated_value=True, logger_file=logger_file,
             num_of_points=None, base_models=None):
    """Build a 2-d approximate query engine for data set `dataID`.

    Args:
        dataID: identifier understood by dl.load2d().
        b_allow_repeated_value (bool): if False, repeated x values are
            removed from the data set before training.
        logger_file: path of the log file (module-level default).
        num_of_points (int, optional): number of training points handed to
            the QueryEngine; None keeps the engine's own default.
        base_models (list, optional): regression base models; defaults to
            [tools.app_linear].
    """
    # BUG FIX: the original default was a mutable list literal
    # (base_models=[tools.app_linear]), shared across calls; use the
    # None-sentinel idiom instead.
    if base_models is None:
        base_models = [tools.app_linear]
    self.logger = logs.QueryLogs(log=logger_file)
    # self.logger.set_no_output()
    self.data = dl.load2d(dataID)
    if not b_allow_repeated_value:
        self.data.remove_repeated_x_1d()
    self.cRegression = CRegression(logger_object=self.logger,
                                   base_models=base_models)
    self.cRegression.fit(self.data)
    # self.logger.set_logging(file_name=logger_file)
    if num_of_points is None:
        self.qe = QueryEngine(self.cRegression, logger_object=self.logger)
    else:
        self.qe = QueryEngine(self.cRegression, logger_object=self.logger,
                              num_training_points=num_of_points)
    self.qe.density_estimation()
    # Query boundaries are taken from the training features.
    self.q_min = min(self.data.features)
    self.q_max = max(self.data.features)
    self.dataID = dataID
    # Release the raw data once the models are trained.
    del self.data
# Mapping kept from the original script (currently unused by the flow below).
param_dict = {'high': 3, 'low': 4}

print('Welcome to the command line interface of the weather project')

# Collect the query parameters interactively.
loc = input('Location: ')
date_time = input('Date (+Time): ')
hourly_daily = input('Hourly or Daily Data?: ')
historical_scraping = input('Historical or Scraping Data?: ')
parameters = input('Which parameters?: ')

qe = QueryEngine()
data = qe.get_data_point(loc, date_time, hourly_daily,
                         historical_scraping, parameters)

for _ in range(3):
    print('...')
print('result:', data)
class Query_Engine_2d:
    """Approximate query engine over a 2-d data set (x -> y).

    Wraps a trained CRegression model in a QueryEngine and exposes
    approximate aggregates (AVG/SUM/COUNT/VARIANCE/COVAR/CORR/PERCENTILE)
    plus mass-query benchmark helpers that compare the approximate answers
    against exact answers obtained from MySQL (or Hive).
    """

    def __init__(self, dataID, b_allow_repeated_value=True,
                 logger_file=logger_file, num_of_points=None):
        """Train the regression model and density estimator for `dataID`.

        Args:
            dataID: identifier understood by dl.load2d().
            b_allow_repeated_value (bool): if False, drop repeated x values.
            logger_file: log file path (module-level default).
            num_of_points (int, optional): training points for QueryEngine;
                None keeps the engine's own default.
        """
        self.logger = logs.QueryLogs(log=logger_file)
        # self.logger.set_no_output()
        self.data = dl.load2d(dataID)
        if not b_allow_repeated_value:
            self.data.remove_repeated_x_1d()
        self.cRegression = CRegression(logger_object=self.logger)
        self.cRegression.fit(self.data)
        # self.logger.set_logging(file_name=logger_file)
        if num_of_points is None:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger)
        else:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger,
                                  num_training_points=num_of_points)
        self.qe.density_estimation()
        # Query range boundaries come from the training features.
        self.q_min = min(self.data.features)
        self.q_max = max(self.data.features)
        self.dataID = dataID
        # warnings.filterwarnings(action='ignore', category=DeprecationWarning)

    def query_2d_avg(self, l=0, h=100):
        """Approximate AVG(y) for x in [l, h]; returns (value, time)."""
        avgs, time = self.qe.approximate_avg_from_to(l, h, 0)
        return avgs, time

    def query_2d_sum(self, l=0, h=100):
        """Approximate SUM(y) for x in [l, h]; returns (value, time)."""
        sums, time = self.qe.approximate_sum_from_to(l, h, 0)
        return sums, time

    def query_2d_count(self, l=0, h=100):
        """Approximate COUNT(*) for x in [l, h]; returns (value, time)."""
        count, time = self.qe.approximate_count_from_to(l, h, 0)
        return count, time

    def query_2d_variance_x(self, l=0, h=100):
        """Approximate VARIANCE(x) for x in [l, h]; returns (value, time)."""
        variance_x, time = self.qe.approximate_variance_x_from_to(l, h, 0)
        return variance_x, time

    def query_2d_variance_y(self, l=0, h=100):
        """Approximate VARIANCE(y) for x in [l, h]; returns (value, time)."""
        variance_y, time = self.qe.approximate_variance_y_from_to(l, h, 0)
        return variance_y, time

    def query_2d_covariance(self, l=0, h=100):
        """Approximate COVAR(x, y) for x in [l, h]; returns (value, time)."""
        covariance, time = self.qe.approximate_covar_from_to(l, h, 0)
        return covariance, time

    def query_2d_correlation(self, l=0, h=100):
        """Approximate CORR(x, y) for x in [l, h]; returns (value, time)."""
        correlation, time = self.qe.approximate_corr_from_to(l, h, 0)
        return correlation, time

    def query_2d_percentile(self, p):
        """Approximate p-th percentile over the whole x range."""
        percentile, time = self.qe.approximate_percentile_from_to(
            p, self.q_min, self.q_max)
        return percentile, time

    # ------------------------------------------------------------------
    # mass-query benchmarking (private helpers deduplicate what used to be
    # seven near-identical method bodies)
    # ------------------------------------------------------------------
    def _half_range(self, percent):
        """Half of a query-range length covering `percent`% of the x domain."""
        return (self.q_max - self.q_min) * percent / 100.0 / 2.0

    def _uniform_query_centres(self, number):
        """Query centres drawn uniformly from [q_min, q_max].

        The seed is fixed so every benchmark run issues the same workload
        (same draw order as the original per-iteration sampling).
        """
        random.seed(1.0)
        return [random.uniform(self.q_min, self.q_max) for _ in range(number)]

    def _kde_query_centres(self, number):
        """Query centres drawn from the learnt density (user-like workload)."""
        return generate_random.make_user_distribution(self.qe.kde, 30, 100,
                                                      n=number)

    def _run_mass_queries(self, queries, b_trailing_blank_log=False):
        """Execute (sqlStr, approx_callable) pairs against MySQL + the engine.

        Records where MySQL returns None or 0 are skipped: a zero exact
        result would break relative_error()'s division.
        Returns (exact_results, approx_results, exact_times, approx_times).
        """
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        number = len(queries)
        for i, (sqlStr, run_approx) in enumerate(queries):
            self.logger.logger.info(
                "start query No." + str(i + 1) + " out of " + str(number))
            self.logger.logger.info(sqlStr)
            approx_result, approx_time = run_approx()
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)
            self.logger.logger.info(exact_result)
            # BUG FIX: the original tested `exact_result is not 0` (object
            # identity); zero aggregates must be excluded by value (!=),
            # otherwise relative_error() later divides by zero.
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning(
            "Approximate query results: " + str(approx_results))
        self.logger.logger.warning(
            "Approximate query time cost: " + str(approx_times))
        if b_trailing_blank_log:
            self.logger.logger.info("")
        return exact_results, approx_results, exact_times, approx_times

    def _mass_range_query(self, select_expr, approx_fn, table, x, percent,
                          query_centres, b_trailing_blank_log=False):
        """Build one BETWEEN-range benchmark query around each centre."""
        half = self._half_range(percent)
        queries = []
        for q_centre in query_centres:
            q_left = q_centre - half
            q_right = q_centre + half
            sqlStr = ("SELECT " + select_expr + " FROM " + str(table) +
                      " WHERE " + x + " BETWEEN " + str(q_left[0]) +
                      " AND " + str(q_right[0]))
            # Bind the bounds as lambda defaults to avoid late-binding
            # closure capture of the loop variables.
            queries.append(
                (sqlStr, lambda l=q_left, h=q_right: approx_fn(l=l, h=h)))
        return self._run_mass_queries(queries, b_trailing_blank_log)

    def mass_query_sum(self, table, x="ss_list_price", y="ss_wholesale_cost",
                       percent=5, number=default_mass_query_number):
        """Benchmark SUM(y) over `number` fixed-seed uniform range queries."""
        return self._mass_range_query("SUM(" + y + ")", self.query_2d_sum,
                                      table, x, percent,
                                      self._uniform_query_centres(number))

    def mass_query_avg(self, table, x="ss_list_price", y="ss_wholesale_cost",
                       percent=5, number=default_mass_query_number):
        """Benchmark AVG(y) over `number` fixed-seed uniform range queries."""
        return self._mass_range_query("AVG(" + y + ")", self.query_2d_avg,
                                      table, x, percent,
                                      self._uniform_query_centres(number))

    def mass_query_count(self, table, x="ss_list_price",
                         y="ss_wholesale_cost", percent=5,
                         number=default_mass_query_number):
        """Benchmark COUNT(y) over `number` fixed-seed uniform range queries."""
        return self._mass_range_query("COUNT(" + y + ")", self.query_2d_count,
                                      table, x, percent,
                                      self._uniform_query_centres(number))

    def mass_query_variance_x(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number):
        """Benchmark VARIANCE(x) with density-driven query centres."""
        return self._mass_range_query("VARIANCE(" + x + ")",
                                      self.query_2d_variance_x, table, x,
                                      percent, self._kde_query_centres(number))

    def mass_query_variance_y(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number):
        """Benchmark VARIANCE(y) with density-driven query centres."""
        return self._mass_range_query("VARIANCE(" + y + ")",
                                      self.query_2d_variance_y, table, x,
                                      percent, self._kde_query_centres(number))

    def mass_query_covariance(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number):
        """Benchmark COVARIANCE(x, y) with density-driven query centres."""
        return self._mass_range_query("COVARIANCE(" + x + ", " + y + ")",
                                      self.query_2d_covariance, table, x,
                                      percent, self._kde_query_centres(number))

    def mass_query_correlation(self, table, x="ss_list_price",
                               y="ss_wholesale_cost", percent=5,
                               number=default_mass_query_number):
        """Benchmark CORR(x, y) with density-driven query centres."""
        return self._mass_range_query("CORR(" + x + ", " + y + ")",
                                      self.query_2d_correlation, table, x,
                                      percent, self._kde_query_centres(number),
                                      b_trailing_blank_log=True)

    def mass_query_percentile(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number):
        """Benchmark percentile queries at fixed-seed uniform p in [0, 1].

        `percent` is accepted for signature symmetry with the other
        mass-query methods but is not used here (no range predicate).
        """
        random.seed(1.0)
        queries = []
        for _ in range(number):
            p = random.uniform(0, 1)
            sqlStr = ("SELECT percentile_cont(" + x + ", " + str(p) +
                      ") FROM " + str(table))
            queries.append((sqlStr, lambda p=p: self.query_2d_percentile(p)))
        return self._run_mass_queries(queries, b_trailing_blank_log=True)

    def relative_error(self, exact_results, approx_results):
        """Mean relative error |exact - approx| / exact over paired results."""
        rel_errors = [abs(i - j) * 1.0 / i
                      for i, j in zip(exact_results, approx_results)]
        result = sum(rel_errors) / len(rel_errors)
        self.logger.logger.warning("Relative error is : " + str(result))
        return result

    def time_ratio(self, exact_times, approx_times):
        """Total approximate query time as a fraction of total exact time."""
        result = sum(approx_times) / sum(exact_times)
        self.logger.logger.warning("Time ratio is : " + str(result))
        return result

    def query2hive(self, sql="SHOW TABLES", use_server=True):
        """Run `sql` on Hive via pyhs2.

        Returns (first column of the last fetched row, seconds spent in
        execute()).  NOTE(review): hard-coded host/credentials should move
        to configuration; an empty fetch would raise NameError — presumably
        the queries used always return at least one row (TODO confirm).
        """
        host = "137.205.118.65" if use_server else "localhost"
        with pyhs2.connect(host=host, port=10000, authMechanism="PLAIN",
                           user='******', password='******',
                           database='default') as conn:
            with conn.cursor() as cur:
                start = datetime.now()
                cur.execute(sql)
                end = datetime.now()
                time_cost = (end - start).total_seconds()
                self.logger.logger.info(
                    "Time spent for HIVE query: %.4fs." % time_cost)
                for i in cur.fetch():
                    self.logger.logger.info(i)
                return i[0], time_cost

    def query2mysql(self, sql="SHOW TABLES", use_server=True):
        """Run `sql` on MySQL via pymysql.

        Returns (first column of the last fetched row, seconds spent
        executing), or (None, time_cost) when the query yields no rows.
        NOTE(review): hard-coded credentials should move to configuration.
        """
        if use_server:
            db = pymysql.connect("137.205.118.65", "u1796377", "bayern",
                                 "hivedb", port=3306)
        else:
            db = pymysql.connect("127.0.0.1", "hiveuser", "bayern",
                                 "hivedb", port=3306)
        cursor = db.cursor()
        start = datetime.now()
        cursor.execute(sql)
        results = cursor.fetchall()
        end = datetime.now()
        time_cost = (end - start).total_seconds()
        self.logger.logger.info(
            "Time spent for MYSQL query: %.4fs." % time_cost)
        # ROBUSTNESS: the original raised NameError when `results` was empty;
        # return None instead — the mass-query callers already handle None.
        row = None
        for row in results:
            self.logger.logger.info(row)
        db.close()
        return (row[0] if row is not None else None), time_cost
from query_engine import QueryEngine

# Corpus files handed to the search engine at startup.
_CORPUS_FILES = [
    'Search_Engine/data/pg135.txt',
    'Search_Engine/data/pg76.txt',
    'Search_Engine/data/pg5200.txt',
]

if __name__ == '__main__':
    query = QueryEngine(_CORPUS_FILES)
def init_whole_range(self, file, table, columnItems, num_of_points=None):
    """Build DBEst query engines for a table, one per column pair.

    Args:
        file (str): path to the CSV file holding the table's data.
        table (str): table name, for example 'store_sales'.
        columnItems (list): column pairs, e.g.
            [["ss_quantity", "ss_ext_discount_amt"], ...].
        num_of_points (dict, optional): total row count per table; when
            None, dataset-specific defaults are used.
    """
    start_time = datetime.now()
    # One column set per (x, y) pair, each prefixed with the table name.
    tableColumnSets = [[table] + columnItems[i]
                       for i in range(len(columnItems))]
    if not hasattr(self, 'df'):
        self.df = {}
    if self.dataset == "tpcds":
        # Default row counts for the TPC-DS tables.
        if num_of_points is None:
            num_of_points = {
                "store_sales": 2685596178,
                "web_page": 3000,
                "time_dim": 86400,
                "web_sales": 720000376,
            }
        # Merge the per-table counts into the instance-wide dict.
        self.num_of_points = {**self.num_of_points, **num_of_points}
        # Kept from the original: this branch is dead (tableColumnSets is
        # always assigned above and can never be None here).
        if tableColumnSets is None:
            tableColumnSets = [
                ["store_sales", "ss_quantity", "ss_ext_discount_amt"]
            ]
        self.tableColumnSets = self.tableColumnSets + tableColumnSets
    if self.dataset == "pp":
        if num_of_points is None:
            num_of_points = {"powerplant": 26000000000}
        self.num_of_points = {**self.num_of_points, **num_of_points}
        # Dead branch kept from the original (see note above).
        if tableColumnSets is None:
            tableColumnSets = [
                ["powerplant", "T", "EP"],
                ["powerplant", "AP", "EP"],
                ["powerplant", "RH", "EP"],
            ]
        self.tableColumnSets = self.tableColumnSets + tableColumnSets
    tables = [sublist[0] for sublist in self.tableColumnSets]
    self.uniqueTables = list(set(tables))
    self.logger.logger.info(
        "Dataset contains " + str(len(self.uniqueTables)) +
        " tables, which are:")
    self.logger.logger.info(self.uniqueTables)
    # Deduplicate the column sets while preserving order.
    self.uniqueTCS = []
    for element in self.tableColumnSets:
        if element not in self.uniqueTCS:
            self.uniqueTCS.append(element)
    self.numOfCsOfTables = [tables.count(uniqueTableName)
                            for uniqueTableName in self.uniqueTables]
    self.logger.logger.info(
        "Tables in the dataset need " + str(self.numOfCsOfTables) +
        " Column Sets.")
    # Group the column sets by their table.
    self.CSinTable = {}
    for uniqueTable in self.uniqueTables:
        # BUG FIX: the original compared table names with `is` (object
        # identity); string equality must use `==`.
        columnSet = [[item[1], item[2]] for item in self.uniqueTCS
                     if item[0] == uniqueTable]
        self.logger.logger.debug(columnSet)
        self.CSinTable[uniqueTable] = columnSet
    self.logger.logger.debug(self.CSinTable)
    # Load the data and train one QueryEngine per (table, column set).
    for uniqueTable in self.uniqueTables:
        self.df[uniqueTable] = pd.read_csv(file)
        self.df[uniqueTable] = self.df[uniqueTable].dropna()
        if uniqueTable in self.DBEstClients:
            DBEstiClient = self.DBEstClients[uniqueTable]
        else:
            DBEstiClient = {}  # all QueryEngines within this table
        for columnItem in self.CSinTable[uniqueTable]:
            # Skip re-training when a model for this (table, column set)
            # already exists.
            if ((uniqueTable in self.DBEstClients) and
                    (str(columnItem) not in self.DBEstClients[uniqueTable])
                    or uniqueTable not in self.DBEstClients):
                self.logger.logger.info(
                    "--------------------------------------------------")
                self.logger.logger.info(
                    "Start training Query Engine for" + " Table " +
                    uniqueTable + ", Column Set: " + str(columnItem))
                headerX = columnItem[0]
                headerY = columnItem[1]
                x = self.df[uniqueTable][[headerX]].values
                y = self.df[uniqueTable][[headerY]].values.reshape(-1)
                data = DataSource()
                data.features = x
                data.labels = y
                data.headers = columnItem
                data.file = file
                cRegression = CRegression(
                    logger_object=self.logger, base_models=self.base_models)
                cRegression.fit(data)
                qe = QueryEngine(
                    cRegression, logger_object=self.logger,
                    num_training_points=self.num_of_points[uniqueTable])
                qe.density_estimation()
                # Drop the training data to keep the model small.
                cRegression.clear_training_data()
                DBEstiClient[str(columnItem)] = qe
                self.logger.logger.info(
                    "Finish training Query Engine " + "for Table " +
                    uniqueTable + ", Column Set: " + str(columnItem))
                self.logger.logger.info(
                    "--------------------------------------------------")
                self.logger.logger.debug(DBEstiClient)
                self.DBEstClients[uniqueTable] = DBEstiClient
            else:
                self.logger.logger.info(
                    "This model exists, no need to train, "
                    "so just skip training! ")
    self.logger.logger.info(self.DBEstClients)
    end_time = datetime.now()
    time_cost = (end_time - start_time).total_seconds()
    self.logger.logger.info(
        "DBEsti has been initialised, ready to serve... (%.1fs)" % time_cost)
def init_groupby(self, file="../data/tpcDs10k/store_sales.csv",
                 table="store_sales", group="ss_store_sk",
                 columnItem=None, num_of_points_per_group=None):
    """Build per-group query engines to support a simple GROUP BY.

    Args:
        file (str, optional): path to the CSV file with the table's data.
        table (str, optional): table name.
        group (str, optional): column name to group by.
        columnItem (list, optional): [x, y] column pair; defaults to
            ["ss_wholesale_cost", "ss_list_price"].
        num_of_points_per_group (dict, optional): total number of points
            of each group; required — exits with an error when missing.
    """
    start_time = datetime.now()
    # BUG FIX: the original default for columnItem was a mutable list
    # literal; use the None-sentinel idiom instead.
    if columnItem is None:
        columnItem = ["ss_wholesale_cost", "ss_list_price"]
    self.logger.logger.info("")
    self.logger.logger.info("Start building GROUP BY for Table " + table)
    # ROBUSTNESS: mirror init_whole_range's guard so this method also works
    # when called before self.df exists.
    if not hasattr(self, 'df'):
        self.df = {}
    self.df[table] = pd.read_csv(file)
    self.df[table] = self.df[table].dropna()
    grouped = self.df[table].groupby(group)
    group_name = str([table, group])
    self.group_names[group_name] = []
    if table not in self.DBEstClients:
        self.DBEstClients[table] = {}
    # The per-group point counts are mandatory.
    if num_of_points_per_group is None:
        self.logger.logger.error(
            "Please provide the information (num_of_points_per_group) "
            "for init_groupby() ")
        sys.exit()
    else:
        self.num_of_points_per_group_tbls[str(
            [table, group])] = num_of_points_per_group
    # BUG FIX: the original loop variable was named `group`, shadowing the
    # group-column parameter; renamed to grp_df.
    for grp_name, grp_df in grouped:
        self.group_names[group_name].append(grp_name)
        columnItemGroup = str(columnItem) + "-" + str(grp_name)
        self.logger.logger.debug(
            "--------------------------------------------------")
        self.logger.logger.debug(
            "Start building groupy by for " + columnItemGroup)
        headerX = columnItem[0]
        headerY = columnItem[1]
        x = grp_df[[headerX]].values
        y = grp_df[[headerY]].values.reshape(-1)
        data = DataSource()
        data.features = x
        data.labels = y
        data.headers = columnItem
        data.file = columnItemGroup
        cRegression = CRegression(
            logger_object=self.logger, b_cross_validation=True,
            base_models=self.base_models)
        cRegression.fit(data)
        self.logger.logger.info(grp_name)
        qe = QueryEngine(
            cRegression, logger_object=self.logger,
            num_training_points=int(
                num_of_points_per_group[str(int(grp_name))]))
        qe.density_estimation()
        # Drop the training data to keep the model small.
        cRegression.clear_training_data()
        self.DBEstClients[table][columnItemGroup] = qe
        # BUG FIX: this log ran after training but said "Start".
        self.logger.logger.info(
            "Finish building group by for " + columnItemGroup)
        self.logger.logger.info(
            "--------------------------------------------------")
    end_time = datetime.now()
    time_cost = (end_time - start_time).total_seconds()
    self.logger.logger.info(
        "GROUP BY has been initialised, ready to serve... (%.1fs)"
        % time_cost)
#!/usr/bin/env python3
from query_engine import QueryEngine

# Interactive search CLI: an empty file name loads the default corpus.
print("file name: ")
file = input()
# BUG FIX: the original used `file is ''` (object identity with a string
# literal); test emptiness by value instead.
if not file:
    qe = QueryEngine()
else:
    qe = QueryEngine(file)
# random comment passing through
print("search for: ")
search_term = input()
result = qe.search_corp(search_term)
result_str = "Please contact the customer support for assistance regarding the same." if len(processed_text) >= 1: result_str = "<" + processed_text[0][ "Link"] + "|" + processed_text[0]["Heading"] + ">\n" result_str += utility.formatted_text(' '.join( processed_text[0]["Content"].split()[:100])) + "\n" if len(processed_text) >= 2: result_str += "*Related article(s):*" + "\n" result_str += "<" + processed_text[1][ "Link"] + "|" + processed_text[1]["Heading"] + ">\n" if len(processed_text) >= 3: result_str += "<" + processed_text[2][ "Link"] + "|" + processed_text[2]["Heading"] + ">\n" #print(result_str) #posting the message to the slack channel if len(result_str) > 0: slack_web_client.chat_postMessage(channel=channel_id, text=result_str, thread_ts=thread_ts_id) #main function for the app if __name__ == "__main__": logger = logging.getLogger() logger.setLevel(logging.DEBUG) logger.addHandler(logging.StreamHandler()) query_engine = QueryEngine() app.run(host='0.0.0.0', port=3000)
class Query_Engine_2d:
    """Approximate query processing (AQP) engine for a 2-d data set.

    Fits a CRegression model over (x, y) pairs loaded via ``dl.load2d`` plus
    a density estimator (QueryEngine), then answers aggregate range queries
    (AVG, SUM, COUNT, VARIANCE, COVAR, CORR, PERCENTILE, MIN, MAX) over x.
    The ``mass_query_*`` methods benchmark the approximate answers against
    exact answers from MySQL (and optionally Hive).

    Fixes vs the previous revision: all identity comparisons with literals
    (``is 0``, ``is 'avg'``, ``is 1``, ``is "COUNT"``) replaced with value
    comparisons, and the seven copy-pasted range benchmarks consolidated
    into shared private helpers (public signatures unchanged).
    """

    def __init__(self, dataID, b_allow_repeated_value=True,
                 logger_file=logger_file, num_of_points=None,
                 base_models=[tools.app_linear]):
        """Build the regression and density models for data set `dataID`.

        Args:
            dataID: identifier understood by ``dl.load2d``.
            b_allow_repeated_value (bool): when False, repeated x values
                are removed before fitting.
            logger_file: log file name (module-level default).
            num_of_points (int, optional): overrides the number of training
                points reported to the QueryEngine.
            base_models (list): base regression models for CRegression.
                NOTE(review): mutable default; safe only because it is
                never mutated here.
        """
        self.logger = logs.QueryLogs(log=logger_file)
        self.data = dl.load2d(dataID)
        if not b_allow_repeated_value:
            self.data.remove_repeated_x_1d()
        self.cRegression = CRegression(logger_object=self.logger,
                                       base_models=base_models)
        self.cRegression.fit(self.data)
        if num_of_points is None:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger)
        else:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger,
                                  num_training_points=num_of_points)
        self.qe.density_estimation()
        # Query domain boundaries; captured before the raw data is released.
        self.q_min = min(self.data.features)
        self.q_max = max(self.data.features)
        self.dataID = dataID
        del self.data  # only the fitted models are needed from here on

    # ------------------------------------------------------------------
    # single approximate queries over the range [l, h]
    # ------------------------------------------------------------------
    def query_2d_avg(self, l=0, h=100):
        """Approximate AVG(y) for x in [l, h]; returns (value, time_cost)."""
        avgs, time = self.qe.approximate_avg_from_to(l, h, 0)
        return avgs, time

    def query_2d_sum(self, l=0, h=100):
        """Approximate SUM(y) for x in [l, h]; returns (value, time_cost)."""
        sums, time = self.qe.approximate_sum_from_to(l, h, 0)
        return sums, time

    def query_2d_count(self, l=0, h=100):
        """Approximate COUNT for x in [l, h]; returns (value, time_cost)."""
        count, time = self.qe.approximate_count_from_to(l, h, 0)
        return count, time

    def query_2d_variance_x(self, l=0, h=100):
        """Approximate VARIANCE(x) for x in [l, h]."""
        variance_x, time = self.qe.approximate_variance_x_from_to(l, h, 0)
        return variance_x, time

    def query_2d_variance_y(self, l=0, h=100):
        """Approximate VARIANCE(y) for x in [l, h]."""
        variance_y, time = self.qe.approximate_variance_y_from_to(l, h, 0)
        return variance_y, time

    def query_2d_covariance(self, l=0, h=100):
        """Approximate COVAR(x, y) for x in [l, h]."""
        covariance, time = self.qe.approximate_covar_from_to(l, h, 0)
        return covariance, time

    def query_2d_correlation(self, l=0, h=100):
        """Approximate CORR(x, y) for x in [l, h]."""
        correlation, time = self.qe.approximate_corr_from_to(l, h, 0)
        return correlation, time

    def query_2d_percentile(self, p):
        """Approximate p-th percentile over the full domain [q_min, q_max]."""
        percentile, time = self.qe.approximate_percentile_from_to(
            p, self.q_min, self.q_max)
        return percentile, time

    def query_2d_min(self, l=0, h=100, ci=True, confidence=0.95):
        """Approximate MIN(y) for x in [l, h], optionally with a CI."""
        _min, time = self.qe.approximate_min_from_to(l, h, ci=ci,
                                                     confidence=confidence)
        return _min, time

    def query_2d_max(self, l=0, h=100, ci=True, confidence=0.95):
        """Approximate MAX(y) for x in [l, h], optionally with a CI."""
        _max, time = self.qe.approximate_max_from_to(l, h, ci=ci,
                                                     confidence=confidence)
        return _max, time

    # ------------------------------------------------------------------
    # shared machinery for the mass_query_* benchmarks
    # ------------------------------------------------------------------
    def _generate_query_centres(self, number, b_random_queries):
        """Return `number` scalar query centres.

        Uniform random (seeded for reproducibility) when
        `b_random_queries`, otherwise drawn from the fitted KDE.
        """
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            # q_min/q_max are 1-element arrays, hence the [0] indexing.
            query_centres = [random.uniform(self.q_min, self.q_max)[0]
                             for _ in range(number)]
        else:
            query_centres = generate_random.make_user_distribution(
                self.qe.kde, self.q_min, self.q_max, n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        return query_centres

    def _run_mass_queries(self, sql_of, approx_of, number):
        """Run `number` paired exact/approximate queries, collect results.

        Args:
            sql_of: callable i -> SQL string of the i-th exact query.
            approx_of: callable i -> (result, time) of the i-th AQP query.

        Records for which MySQL returns None or 0 are skipped.
        Returns (exact_results, approx_results, exact_times, approx_times).
        """
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            sqlStr = sql_of(i)
            self.logger.logger.info(sqlStr)
            approx_result, approx_time = approx_of(i)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)
            self.logger.logger.info(exact_result)
            # bugfix: was `is not 0` (identity test) — use value comparison.
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def _mass_query_range(self, table, x, percent, number, b_random_queries,
                          agg_expr, approx_fn):
        """Shared driver for the range-based mass_query_* methods.

        `agg_expr` is the SQL aggregate expression (e.g. "SUM(col)");
        `approx_fn` is the matching query_2d_* bound method.
        """
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        query_centres = self._generate_query_centres(number, b_random_queries)

        def bounds(i):
            # centre is scalar; half-length is a 1-element array, so the
            # bounds are 1-element arrays (indexed with [0] for SQL text).
            q_centre = query_centres[i]
            return (q_centre - q_range_half_length,
                    q_centre + q_range_half_length)

        def sql_of(i):
            q_left, q_right = bounds(i)
            return ("SELECT " + agg_expr + " FROM " + str(table) +
                    " WHERE " + x + " BETWEEN " + str(q_left[0]) +
                    " AND " + str(q_right[0]))

        def approx_of(i):
            q_left, q_right = bounds(i)
            return approx_fn(l=q_left, h=q_right)

        return self._run_mass_queries(sql_of, approx_of, number)

    # ------------------------------------------------------------------
    # benchmark drivers (public signatures unchanged)
    # ------------------------------------------------------------------
    def mass_query_sum(self, table, x="ss_list_price", y="ss_wholesale_cost",
                       percent=5, number=default_mass_query_number,
                       b_random_queries=True):
        """Benchmark approximate SUM against MySQL over `number` ranges."""
        return self._mass_query_range(table, x, percent, number,
                                      b_random_queries, "SUM(" + y + ")",
                                      self.query_2d_sum)

    def mass_query_avg(self, table, x="ss_list_price", y="ss_wholesale_cost",
                       percent=5, number=default_mass_query_number,
                       b_random_queries=True):
        """Benchmark approximate AVG against MySQL over `number` ranges."""
        return self._mass_query_range(table, x, percent, number,
                                      b_random_queries, "AVG(" + y + ")",
                                      self.query_2d_avg)

    def mass_query_count(self, table, x="ss_list_price",
                         y="ss_wholesale_cost", percent=5,
                         number=default_mass_query_number,
                         b_random_queries=True):
        """Benchmark approximate COUNT against MySQL over `number` ranges."""
        return self._mass_query_range(table, x, percent, number,
                                      b_random_queries, "COUNT(" + y + ")",
                                      self.query_2d_count)

    def mass_query_variance_x(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        """Benchmark approximate VARIANCE(x) against MySQL."""
        return self._mass_query_range(table, x, percent, number,
                                      b_random_queries, "VARIANCE(" + x + ")",
                                      self.query_2d_variance_x)

    def mass_query_variance_y(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        """Benchmark approximate VARIANCE(y) against MySQL."""
        return self._mass_query_range(table, x, percent, number,
                                      b_random_queries, "VARIANCE(" + y + ")",
                                      self.query_2d_variance_y)

    def mass_query_covariance(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        """Benchmark approximate COVAR(x, y) against MySQL."""
        return self._mass_query_range(table, x, percent, number,
                                      b_random_queries,
                                      "COVARIANCE(" + x + ", " + y + ")",
                                      self.query_2d_covariance)

    def mass_query_correlation(self, table, x="ss_list_price",
                               y="ss_wholesale_cost", percent=5,
                               number=default_mass_query_number,
                               b_random_queries=True):
        """Benchmark approximate CORR(x, y) against MySQL."""
        results = self._mass_query_range(table, x, percent, number,
                                         b_random_queries,
                                         "CORR(" + x + ", " + y + ")",
                                         self.query_2d_correlation)
        self.logger.logger.info("")
        return results

    def mass_query_percentile(self, table, x="ss_list_price",
                              y="ss_wholesale_cost", percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        """Benchmark approximate percentiles against percentile_cont."""
        random.seed(1.0)
        percentiles = [random.uniform(0, 1) for _ in range(number)]

        def sql_of(i):
            return ("SELECT percentile_cont(" + x + ", " +
                    str(percentiles[i]) + ") FROM " + str(table))

        def approx_of(i):
            return self.query_2d_percentile(percentiles[i])

        results = self._run_mass_queries(sql_of, approx_of, number)
        self.logger.logger.info("")
        return results

    def mass_query_min(self, table, x="ss_list_price", y="ss_wholesale_cost",
                       percent=5, number=default_mass_query_number,
                       b_random_queries=True):
        """Benchmark approximate MIN against MySQL over `number` ranges."""
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        random.seed(1.0)
        # Unlike the other benchmarks, centres stay as 1-element arrays
        # here (no [0]) — preserves the original sampling behaviour.
        centres = [random.uniform(self.q_min, self.q_max)
                   for _ in range(number)]

        def sql_of(i):
            q_left = centres[i] - q_range_half_length
            q_right = centres[i] + q_range_half_length
            return ("SELECT MIN(" + y + ") FROM " + str(table) + " WHERE " +
                    x + " BETWEEN " + str(q_left[0]) + " AND " +
                    str(q_right[0]))

        def approx_of(i):
            q_left = centres[i] - q_range_half_length
            q_right = centres[i] + q_range_half_length
            return self.query_2d_min(l=q_left[0], h=q_right[0])

        return self._run_mass_queries(sql_of, approx_of, number)

    def mass_query_max(self, table, x="ss_list_price", y="ss_wholesale_cost",
                       percent=5, number=default_mass_query_number,
                       b_random_queries=True):
        """Benchmark approximate MAX against MySQL over `number` ranges."""
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        random.seed(1.0)
        centres = [random.uniform(self.q_min, self.q_max)
                   for _ in range(number)]

        def sql_of(i):
            q_left = centres[i] - q_range_half_length
            q_right = centres[i] + q_range_half_length
            return ("SELECT MAX(" + y + ") FROM " + str(table) + " WHERE " +
                    x + " BETWEEN " + str(q_left[0]) + " AND " +
                    str(q_right[0]))

        def approx_of(i):
            q_left = centres[i] - q_range_half_length
            q_right = centres[i] + q_range_half_length
            return self.query_2d_max(l=q_left[0], h=q_right[0])

        return self._run_mass_queries(sql_of, approx_of, number)

    # ------------------------------------------------------------------
    # metrics
    # ------------------------------------------------------------------
    def relative_error(self, exact_results, approx_results):
        """Mean relative error of approximate vs exact results (logged)."""
        rel_errors = [
            abs(i - j) * 1.0 / i
            for i, j in zip(exact_results, approx_results)
        ]
        result = sum(rel_errors) / len(rel_errors)
        self.logger.logger.warning("Relative error is : " + str(result))
        return result

    def time_ratio(self, exact_times, approx_times):
        """Ratio of total approximate time to total exact time (logged)."""
        result = sum(approx_times) / sum(exact_times)
        self.logger.logger.warning("Time ratio is : " + str(result))
        return result

    # ------------------------------------------------------------------
    # backend connectors
    # ------------------------------------------------------------------
    def query2hive2(self, sql="SHOW TABLES", use_server=True):
        """Run `sql` on Hive via pyhive.

        Returns (first column of the last fetched row, time_cost seconds).
        NOTE(review): raises NameError if the query returns no rows.
        """
        if use_server:
            host = "137.205.118.65"
        else:
            host = "localhost"
        conn = hive.connect(host=host, port=10000, username='******',
                            auth='NOSASL')
        cursor = conn.cursor()
        start = datetime.now()
        cursor.execute(sql)
        end = datetime.now()
        time_cost = (end - start).total_seconds()
        for result in cursor.fetchall():
            self.logger.logger.info(result)
        return result[0], time_cost

    def query2mysql(self, sql="SHOW TABLES", use_server=True):
        """Run `sql` on MySQL via pymysql.

        Returns (first column of the last fetched row, time_cost seconds).
        SECURITY(review): hard-coded credentials; move to configuration.
        NOTE(review): raises NameError if the query returns no rows.
        """
        if use_server:
            db = pymysql.connect("137.205.118.65", "u1796377", "bayern",
                                 "hivedb", port=3306)
        else:
            db = pymysql.connect("127.0.0.1", "hiveuser", "bayern", "hivedb",
                                 port=3306)
        cursor = db.cursor()
        start = datetime.now()
        cursor.execute(sql)
        results = cursor.fetchall()
        end = datetime.now()
        time_cost = (end - start).total_seconds()
        self.logger.logger.info("Time spent for MYSQL query: %.4fs."
                                % time_cost)
        for row in results:
            self.logger.logger.info(row)
        db.close()
        return row[0], time_cost

    # ------------------------------------------------------------------
    # batch drivers over query files
    # ------------------------------------------------------------------
    def mass_query(self, file, agg_func='avg', ci=True, confidence=0.95):
        """Answer the queries listed in `file` approximately.

        Each line is an "l,h" pair, except for agg_func='percentile' where
        each line is a single p value. Unknown aggregates are skipped.
        Returns the list of approximate results.
        (bugfix: agg_func was compared with `is`, which tests identity,
        not equality — it silently failed for non-interned strings.)
        """
        range_funcs = {
            'avg': self.query_2d_avg,
            'sum': self.query_2d_sum,
            'count': self.query_2d_count,
            'variance_x': self.query_2d_variance_x,
            'covar': self.query_2d_covariance,
            'corr': self.query_2d_correlation,
        }
        AQP_results = []
        time_costs = []
        index = 0
        with open(file) as fin:
            for line in fin:
                self.logger.logger.info("Starting Query " + str(index) + ":")
                index = index + 1
                if agg_func == 'percentile':
                    result, time = self.query_2d_percentile(float(line))
                elif agg_func in range_funcs:
                    lh = line.split(",")
                    result, time = range_funcs[agg_func](float(lh[0]),
                                                         float(lh[1]))
                elif agg_func == 'min':
                    lh = line.split(",")
                    result, time = self.query_2d_min(float(lh[0]),
                                                     float(lh[1]), ci=ci,
                                                     confidence=confidence)
                elif agg_func == 'max':
                    lh = line.split(",")
                    result, time = self.query_2d_max(float(lh[0]),
                                                     float(lh[1]), ci=ci,
                                                     confidence=confidence)
                else:
                    continue  # unknown aggregate: record nothing, as before
                AQP_results.append(result)
                time_costs.append(time)
        self.logger.logger.info(AQP_results)
        self.logger.logger.info(time_costs)
        return AQP_results

    def generate_queries(self, table, x="ss_list_price",
                         y="ss_wholesale_cost", percent=5,
                         number=default_mass_query_number,
                         b_random_queries=False, mode=2, file_pre="etrade_"):
        """Write benchmark query files (.qreg and .hiveql).

        Args:
            table: table name used in the generated SQL.
            x (str, optional): x column name.
            y (str, optional): y column name.
            percent (int, optional): query width as a percent of the domain.
            number: number of queries generated per aggregate.
            b_random_queries (bool, optional): uniform random centres when
                True, otherwise KDE-based centres.
            mode (int, optional): 1 -> percentile queries,
                2 -> range aggregate queries; anything else aborts.
            file_pre (str, optional): prefix of the output file names.
        (bugfix: `mode` and `aggregate` were compared with `is`.)
        """
        if mode == 1:
            file_name_qreg = file_pre + "percentile" + str(percent) + ".qreg"
            file_name_hiveql = file_pre + \
                "percentile" + str(percent) + ".hiveql"
        elif mode == 2:
            file_name_qreg = file_pre + "queries" + str(percent) + ".qreg"
            file_name_hiveql = file_pre + "queries" + str(percent) + ".hiveql"
        else:
            self.logger.logger.error(
                "Failed to generate queries, no mode selected!")
            sys.exit(errno.EPERM)
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                if mode == 1:
                    # percentile centres are plain fractions in [0.4, 1)
                    query_centres.append(random.uniform(0.4, 1))
                else:
                    query_centres.append(
                        random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(
                self.qe.kde, self.q_min, self.q_max, n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        if mode == 1:
            with open(file_name_hiveql, 'w+') as f_hiveql:
                with open(file_name_qreg, 'w+') as f_qreg:
                    for i in range(number):
                        q_centre = query_centres[i]
                        qregStr = str(q_centre)
                        sqlStr = "SELECT percentile_cont(" + x + ", " + \
                            str(q_centre) + ") FROM " + str(table)
                        f_hiveql.write(sqlStr + "\n")
                        f_qreg.write(qregStr + "\n")
        else:
            with open(file_name_hiveql, 'w+') as f_hiveql:
                with open(file_name_qreg, 'w+') as f_qreg:
                    aggregates = ["COUNT", "SUM", "AVG", "MIN", "MAX"]
                    for aggregate in aggregates:
                        for i in range(number):
                            q_centre = query_centres[i]
                            q_left = q_centre - q_range_half_length
                            q_right = q_centre + q_range_half_length
                            qregStr = str(q_left[0]) + ", " + str(q_right[0])
                            sqlStr = "SELECT " + aggregate + "(" + y + \
                                ") FROM " + str(table) + " WHERE " + x + \
                                " BETWEEN " + str(q_left[0]) + " AND " + \
                                str(q_right[0])
                            f_hiveql.write(sqlStr + "\n")
                            # the .qreg ranges are written only once
                            if aggregate == "COUNT":
                                f_qreg.write(qregStr + "\n")
                    # trailing VARIANCE(x) queries go to the hiveql file only
                    for i in range(number):
                        q_centre = query_centres[i]
                        q_left = q_centre - q_range_half_length
                        q_right = q_centre + q_range_half_length
                        sqlStr = "SELECT " + "VARIANCE" + "(" + x + \
                            ") FROM " + str(table) + " WHERE " + x + \
                            " BETWEEN " + str(q_left[0]) + " AND " + \
                            str(q_right[0])
                        f_hiveql.write(sqlStr + "\n")
        self.logger.logger.info("Queries are written to file " +
                                file_name_qreg + " and " + file_name_hiveql)

    def mass_query2mysql(self, file):
        """Run every SQL line in `file` against MySQL; return result list."""
        AQP_results = []
        time_costs = []
        index = 1
        with open(file) as fin:
            for line in fin:
                self.logger.logger.info("Starting Query " + str(index) + ":")
                index = index + 1
                self.logger.logger.info(line)
                result, time = self.query2mysql(sql=line)
                AQP_results.append(result)
                time_costs.append(time)
        self.logger.logger.info(AQP_results)
        self.logger.logger.info(time_costs)
        return AQP_results

    def mass_query2hive(self, file):
        """Run every SQL line in `file` against Hive.

        On the first failure an alert e-mail is sent and the process exits.
        """
        AQP_results = []
        time_costs = []
        with open(file) as fin:
            for index, line in enumerate(fin, start=1):
                try:
                    self.logger.logger.info("Starting Query " + str(index) +
                                            ":")
                    self.logger.logger.info(line)
                    result, time = self.query2hive2(sql=line)
                    AQP_results.append(result)
                    time_costs.append(time)
                except Exception:
                    # SECURITY(review): hard-coded SMTP credentials; move
                    # them to configuration and out of version control.
                    import smtplib
                    server = smtplib.SMTP('smtp.gmail.com', 587)
                    server.starttls()
                    server.login("*****@*****.**", "warwickpass")
                    msg = "failed!"
                    server.sendmail("*****@*****.**", "*****@*****.**", msg)
                    server.quit()
                    import sys
                    sys.exit("Error message")
        self.logger.logger.info(AQP_results)
        self.logger.logger.info(time_costs)
        return AQP_results