def query2Hash_test(db, printOutput):
    # Count lineitems with return flag 'R' per part name, via a hash join
    # between part and the pre-filtered lineitem table (111 hash partitions).
    query2 = db.query().fromTable('part') \
        .join(db.query().fromTable('lineitem').where('L_RETURNFLAG == \'R\''),
              rhsSchema=db.relationSchema('lineitem'),
              method='hash',
              lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]),
              rhsHashFn='hash(L_PARTKEY) % 111', rhsKeySchema=DBSchema('L_PARTKEY', [('L_PARTKEY', 'int')])) \
        .groupBy(groupSchema=DBSchema('P_NAME', [('P_NAME', 'char(55)')]),
                 aggSchema=DBSchema('COUNT', [('COUNT', 'int')]),
                 groupExpr=(lambda e: e.P_NAME),
                 # COUNT(*): start at 0, add 1 per tuple, identity finalizer.
                 aggExprs=[(0, lambda acc, e: acc + 1, lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .finalize()
    # Time the full execution, unpacking every result tuple from every page.
    start = time.time()
    processedQuery = [
        query2.schema().unpack(tup) for page in db.processQuery(query2)
        for tup in page[1]
    ]
    end = time.time()
    duration = end - start
    print('Query: Query 2, Hash')
    if printOutput:
        print('Results: ', processedQuery)
    print('Time', duration)
    return
def query1(db):
    """Build TPC-H Q6: total revenue over filtered lineitem rows.

    Filters on ship date range, discount band, and quantity, then sums
    l_extendedprice * l_discount into a single group.

    Fix: the original predicate string ended with '(L_QUANTITY < 24' —
    the closing parenthesis was missing, making the expression invalid.
    """
    query = db.query().fromTable('lineitem')\
        .where(
            "(L_SHIPDATE >= 19940101) and (L_SHIPDATE < 19950101) and "
            "(0.06-0.01 <= L_DISCOUNT <= 0.06 + 0.01) and (L_QUANTITY < 24)")\
        .groupBy(
            groupSchema=DBSchema('groupKey', [('ONE', 'int')]),
            groupExpr=(lambda e: 1),  # single global group
            aggSchema=DBSchema('groupBy', [('revenue', 'float')]),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * e.L_DISCOUNT),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 1))\
        .select(
            {'revenue': ('revenue', 'float')})\
        .finalize()
    return query
def testHashJoin(self):
    """Hash self-join of employee on id; every employee matches exactly once."""
    empSchema = self.db.relationSchema('employee')
    renamedSchema = empSchema.rename(
        'employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'})
    lhsKey = DBSchema('employeeKey', [('id', 'int')])
    rhsKey = DBSchema('employeeKey2', [('id2', 'int')])
    plan = self.db.query().fromTable('employee').join(
        self.db.query().fromTable('employee'),
        rhsSchema=renamedSchema,
        method='hash',
        lhsHashFn='hash(id) % 4', lhsKeySchema=lhsKey,
        rhsHashFn='hash(id2) % 4', rhsKeySchema=rhsKey,
    ).finalize()
    rows = self.getResults(plan)
    self.assertEqual(len(rows), self.numEmployees)
def validateSchema(self):
    """Check that both union inputs have matching schemas and build the
    output schema; raises ValueError on a mismatch."""
    lhs = self.lhsPlan.schema()
    if not lhs.match(self.rhsPlan.schema()):
        raise ValueError("Union operator type error, mismatched input schemas")
    # Output schema mirrors the (identical) input schemas under a fresh name.
    outputName = self.operatorType() + str(self.id())
    self.unionSchema = DBSchema(outputName, lhs.schema())
def createRelation(self, relationName, relationFields):
    """Register a new relation in the catalog and storage layer.

    Raises ValueError if a relation with this name already exists.
    """
    # Guard clause: refuse duplicates before touching catalog or storage.
    if relationName in self.relationMap:
        raise ValueError("Relation '" + relationName + "' already exists")
    schema = DBSchema(relationName, relationFields)
    self.relationMap[relationName] = schema
    self.storage.createRelation(relationName, schema)
    self.checkpoint()  # persist the updated catalog
def query2(db):
    """Build TPC-H Q14-style plan: promo revenue over part ⋈ lineitem
    restricted to September 1995 shipments."""
    base = db.query().fromTable('part')
    septLineitem = db.query().fromTable('lineitem').where(
        "(L_SHIPDATE >= 19950901) and (L_SHIPDATE < 19951001)")
    query = base\
        .join(septLineitem,
              method='hash',
              lhsHashFn='hash(P_PARTKEY) % 7',
              lhsKeySchema=DBSchema('partkey2', [('P_PARTKEY', 'int')]),
              rhsHashFn='hash(L_PARTKEY) % 7',
              rhsKeySchema=DBSchema('partkey1', [('L_PARTKEY', 'int')]))\
        .groupBy(
            groupSchema=DBSchema('groupKey', [('ONE', 'int')]),
            aggSchema=DBSchema('groupBy', [('promo_revenue', 'float')]),
            groupExpr=(lambda e: 1),  # collapse everything into one group
            aggExprs=[(0,
                       lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 1))\
        .select(
            {'promo_revenue': ('promo_revenue', 'float')})\
        .finalize()
    return query
def query3bBNL_test(db, printOutput):
    # Four-way block-nested-loops join chain
    # (part -> lineitem -> orders -> customer -> nation), then a two-level
    # aggregation: total quantity per (nation, part), then the max such
    # total per nation.
    query3 = db.query().fromTable('part') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops',
              expr='L_PARTKEY == P_PARTKEY') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops',
              expr='L_ORDERKEY == O_ORDERKEY') \
        .join(db.query().fromTable('customer'),
              rhsSchema=db.relationSchema('customer'),
              method='block-nested-loops',
              expr='O_CUSTKEY == C_CUSTKEY') \
        .join(db.query().fromTable('nation'),
              rhsSchema=db.relationSchema('nation'),
              method='block-nested-loops',
              expr='C_NATIONKEY == N_NATIONKEY') \
        .groupBy(groupSchema=DBSchema('FIRST', [('N_NAME', 'char(25)'), ('P_NAME', 'char(55)')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (e.N_NAME, e.P_NAME)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_QUANTITY, lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .groupBy(groupSchema=DBSchema('SECOND', [('N_NAME', 'char(25)')]),
                 aggSchema=DBSchema('MAX', [('MAX', 'double')]),
                 groupExpr=(lambda e: e.N_NAME),
                 # Max over the per-(nation, part) sums produced above.
                 aggExprs=[(0, lambda acc, e: max(acc, e.SUM), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'NATION': ('N_NAME', 'char(25)'), 'MAX': ('MAX', 'double')}) \
        .finalize()
    # Time the full execution, unpacking every result tuple from every page.
    start = time.time()
    processedQuery = [
        query3.schema().unpack(tup) for page in db.processQuery(query3)
        for tup in page[1]
    ]
    end = time.time()
    duration = end - start
    print('Query: Query 3b (Order Reversed), BNL')
    if printOutput:
        print('Results: ', processedQuery)
    print('Time', duration)
    return
def testGroupBy(self):
    """Group employees by id parity and check min/max age per group.

    Equivalent SQL: SELECT id, min(age), max(age) FROM Employee GROUP BY (id % 2)
    """
    minMaxSchema = DBSchema('minmax', [('minAge', 'int'), ('maxAge', 'int')])
    paritySchema = DBSchema('employeeKey', [('id', 'int')])
    plan = self.db.query().fromTable('employee').groupBy(
        groupSchema=paritySchema,
        aggSchema=minMaxSchema,
        groupExpr=(lambda e: e.id % 2),
        aggExprs=[
            (sys.maxsize, lambda acc, e: min(acc, e.age), lambda x: x),
            (0, lambda acc, e: max(acc, e.age), lambda x: x),
        ],
        groupHashFn=(lambda gbVal: hash(gbVal[0]) % 2),
    ).finalize()
    rows = self.getResults(plan)
    self.assertEqual(len(rows), 2)
    # Ages are 20, 22, ..., 58; even ids hold the even ages 20..56.
    expected = {0: (20, 56), 1: (22, 58)}
    for row in rows:
        lo, hi = expected[row.id]
        self.assertEqual(row.minAge, lo)
        self.assertEqual(row.maxAge, hi)
def query3(db):
    """Build TPC-H Q3-style plan: revenue per open order in the BUILDING
    segment, via two hash joins then filter, group, and project."""
    custLhs = DBSchema('customerKey1', [('C_CUSTKEY', 'int')])
    custRhs = DBSchema('customerKey2', [('O_CUSTKEY', 'int')])
    ordLhs = DBSchema('orderKey1', [('O_ORDERKEY', 'int')])
    ordRhs = DBSchema('orderkey2', [('L_ORDERKEY', 'int')])
    groupKey = DBSchema('groupKey', [('L_ORDERKEY', 'int'),
                                     ('O_ORDERDATE', 'int'),
                                     ('O_SHIPPRIORITY', 'int')])
    groupAgg = DBSchema('groupAgg', [('revenue', 'float')])

    query = db.query().fromTable('customer')\
        .join(db.query().fromTable('orders'),
              method='hash',
              lhsHashFn='hash(C_CUSTKEY) % 5', lhsKeySchema=custLhs,
              rhsHashFn='hash(O_CUSTKEY) % 5', rhsKeySchema=custRhs)\
        .join(db.query().fromTable('lineitem'),
              method='hash',
              lhsHashFn='hash(O_ORDERKEY) % 5', lhsKeySchema=ordLhs,
              rhsHashFn='hash(L_ORDERKEY) % 5', rhsKeySchema=ordRhs)\
        .where(
            "C_MKTSEGMENT == 'BUILDING' and O_ORDERDATE < 19950315 and L_SHIPDATE > 19950315")\
        .groupBy(
            groupSchema=groupKey,
            aggSchema=groupAgg,
            groupExpr=(lambda e: (e.L_ORDERKEY, e.O_ORDERDATE, e.O_SHIPPRIORITY)),
            aggExprs=[(0,
                       lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 10))\
        .select(
            {'l_orderkey': ('L_ORDERKEY', 'int'),
             'revenue': ('revenue', 'float'),
             'o_orderdate': ('O_ORDERDATE', 'int'),
             'o_shippriority': ('O_SHIPPRIORITY', 'int')})\
        .finalize()
    return query
def testIndexJoin(self):
    # Indexed self-join of employee against itself using the age index
    # created in setUp; with expr='True' every probed match is kept.
    schema = self.db.relationSchema('employee')
    e2schema = schema.rename('employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'})
    lhsKeySchema = DBSchema('employeeAge', [('age2', 'int')])
    # NOTE(review): the other join tests pass the renamed schema as
    # rhsSchema, while here it is passed as lhsSchema and lhsKeySchema
    # uses the renamed field 'age2' — confirm this matches the indexed
    # join operator's expected argument convention.
    join = self.db.query().fromTable('employee').join( \
        self.db.query().fromTable('employee'), \
        lhsSchema=e2schema, \
        lhsKeySchema=lhsKeySchema, \
        method='indexed', \
        indexId=self.idxId, \
        expr='True'
    ).finalize()
    results = self.getResults(join)
    self.assertEqual(len(results), self.numEmployees)
def setUp(self):
    """Create a fresh employee relation with an age index and 20 rows."""
    warnings.simplefilter("ignore", ResourceWarning)
    # Start with a fresh 'employee' table
    self.db = Database.Database()
    self.db.createRelation(
        'employee', [('id', 'int'), ('age', 'int'), ('dept_id', 'int')])
    self.numEmployees = 20
    empSchema = self.db.relationSchema('employee')
    # Create an index over employee age
    keySchema = DBSchema('employeeAge', [('age', 'int')])
    self.idxId = self.db.storageEngine().createIndex(
        'employee', empSchema, keySchema, False)
    # Rows: id i, age 2*i + 20, department alternating 0/1.
    for i in range(self.numEmployees):
        packed = empSchema.pack(empSchema.instantiate(i, 2 * i + 20, i % 2))
        self.db.insertTuple(empSchema.name, packed)
def query1Hash_test(db, printOutput):
    # Union of two three-way hash-join chains over part/partsupp/supplier:
    # one branch filters partsupp on availability, the other on supply cost;
    # the union is projected to part and supplier names.
    query1 = db.query().fromTable('part') \
        .join(db.query().fromTable('partsupp').where('PS_AVAILQTY == 1'),
              rhsSchema=db.relationSchema('partsupp'),
              method='hash',
              lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]),
              rhsHashFn='hash(PS_PARTKEY) % 111', rhsKeySchema=DBSchema('PS_PARTKEY', [('PS_PARTKEY', 'int')])) \
        .join(db.query().fromTable('supplier'),
              rhsSchema=db.relationSchema('supplier'),
              method='hash',
              lhsHashFn='hash(PS_SUPPKEY) % 111', lhsKeySchema=DBSchema('PS_SUPPKEY', [('PS_SUPPKEY', 'int')]),
              rhsHashFn='hash(S_SUPPKEY) % 111', rhsKeySchema=DBSchema('S_SUPPKEY', [('S_SUPPKEY', 'int')])) \
        .union(
            db.query().fromTable('part')
            .join(db.query().fromTable('partsupp').where('PS_SUPPLYCOST < 5'),
                  rhsSchema=db.relationSchema('partsupp'),
                  method='hash',
                  lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]),
                  rhsHashFn='hash(PS_PARTKEY) % 111', rhsKeySchema=DBSchema('PS_PARTKEY', [('PS_PARTKEY', 'int')])) \
            .join(db.query().fromTable('supplier'),
                  rhsSchema=db.relationSchema('supplier'),
                  method='hash',
                  lhsHashFn='hash(PS_SUPPKEY) % 111', lhsKeySchema=DBSchema('PS_SUPPKEY', [('PS_SUPPKEY', 'int')]),
                  rhsHashFn='hash(S_SUPPKEY) % 111', rhsKeySchema=DBSchema('S_SUPPKEY', [('S_SUPPKEY', 'int')]))) \
        .select({'P_NAME': ('P_NAME', 'char(55)'), 'S_NAME': ('S_NAME', 'char(25)')}) \
        .finalize()
    # Time the full execution, unpacking every result tuple from every page.
    start = time.time()
    processedQuery = [
        query1.schema().unpack(tup) for page in db.processQuery(query1)
        for tup in page[1]
    ]
    end = time.time()
    duration = end - start
    print('Query: Query 1, Hash')
    if printOutput:
        print('Results: ', processedQuery)
    print('Time', duration)
    return
def processAllPages(self):
    """Hash-partition the input by group key, then aggregate each partition.

    Phase 1 spills input tuples into per-bucket temporary relations keyed by
    the group hash. Phase 2 scans each bucket, folds the aggregate
    expressions per distinct group key, applies the finalizers, and emits
    one output tuple per group.

    Fixes relative to the original:
      * `str(self.id)` embedded a bound-method repr in the temp relation id;
        it now calls `self.id()` like the rest of the codebase.
      * `createRelation` was invoked for every tuple (re-creating existing
        relations); it now runs once per new partition.
      * The locally built output schema was used for instantiate but
        `self.outputSchema` for pack; both now use the same schema.

    Returns an iterator over the operator's output pages.
    """
    schemaName = self.operatorType() + str(self.id())
    outputSchema = DBSchema(
        schemaName, self.groupSchema.schema() + self.aggSchema.schema())

    # Phase 1: partition input tuples into temporary relations by hash bucket.
    relIds = []
    for (pageId, page) in iter(self.subPlan):
        for tpl in page:
            group = self.groupExpr(self.subSchema.unpack(tpl))
            key = self.groupHashFn((group, None))
            relId = str(self.id()) + "u" + str(key)
            if relId not in relIds:
                # Create each partition relation exactly once.
                self.storage.createRelation(relId, self.subSchema)
                relIds.append(relId)
            self.storage.insertTuple(relId, tpl)

    # Phase 2: aggregate every partition group-by-group.
    for rid in relIds:
        groupDict = {}
        for (pageId, page) in self.storage.pages(rid):
            for tpl in page:
                record = self.subSchema.unpack(tpl)
                groupKey = self.groupExpr(record)
                if groupKey not in groupDict:
                    # Seed accumulators with each aggregate's initial value.
                    groupDict[groupKey] = [init for (init, _, _) in self.aggExprs]
                for i, (_, step, _) in enumerate(self.aggExprs):
                    groupDict[groupKey][i] = step(groupDict[groupKey][i], record)

        # Apply finalizers and emit one tuple per group.
        for groupKey, accs in groupDict.items():
            finals = [fin(acc)
                      for (_, _, fin), acc in zip(self.aggExprs, accs)]
            outTuple = outputSchema.instantiate(groupKey, *finals)
            self.emitOutputTuple(outputSchema.pack(outTuple))

    return self.storage.pages(self.relationId())
def query4(db):
    """Build TPC-H Q10-style plan: returned-item revenue per customer for
    orders placed in 1993-Q4, over nation ⋈ customer ⋈ orders ⋈ lineitem.

    Fix: the group expression listed `e.N_NAME` twice, producing 8 values
    for the 7-field group schema; the duplicate has been removed so the
    group key matches `groupSchema` field-for-field.
    """
    query = db.query().fromTable('nation')\
        .join(
            db.query().fromTable('customer'),
            method='hash',
            lhsHashFn='hash(N_NATIONKEY) % 11',
            lhsKeySchema=DBSchema('nationKey1', [('N_NATIONKEY', 'int')]),
            rhsHashFn='hash(C_NATIONKEY) % 11',
            rhsKeySchema=DBSchema('nationKey2', [('C_NATIONKEY', 'int')]))\
        .join(
            db.query().fromTable('orders'),
            method='hash',
            lhsHashFn='hash(C_CUSTKEY) % 11',
            lhsKeySchema=DBSchema('custKey1', [('C_CUSTKEY', 'int')]),
            rhsHashFn='hash(O_CUSTKEY) % 11',
            rhsKeySchema=DBSchema('custKey2', [('O_CUSTKEY', 'int')])) \
        .join(
            db.query().fromTable('lineitem'),
            method='hash',
            lhsHashFn='hash(O_ORDERKEY) % 11',
            lhsKeySchema=DBSchema('orderKey1', [('O_ORDERKEY', 'int')]),
            rhsHashFn='hash(L_ORDERKEY) % 11',
            rhsKeySchema=DBSchema('orderKey2', [('L_ORDERKEY', 'int')])) \
        .where(
            "O_ORDERDATE >= 19931001 and O_ORDERDATE < 19940101 and L_RETURNFLAG == 'R'")\
        .groupBy(
            groupSchema=DBSchema('groupKey',
                                 [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'),
                                  ('C_ACCTBAL', 'float'), ('C_PHONE', 'char(15)'),
                                  ('N_NAME', 'char(25)'), ('C_ADDRESS', 'char(40)'),
                                  ('C_COMMENT', 'char(117)')]),
            aggSchema=DBSchema('groupAgg', [('revenue', 'float')]),
            # Exactly one value per group-schema field (duplicate removed).
            groupExpr=(lambda e: (e.C_CUSTKEY, e.C_NAME, e.C_ACCTBAL, e.C_PHONE,
                                  e.N_NAME, e.C_ADDRESS, e.C_COMMENT)),
            aggExprs=[(0,
                       lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 11))\
        .select(
            {'c_custkey': ('C_CUSTKEY', 'int'),
             'c_name': ('C_NAME', 'char(25)'),
             'revenue': ('revenue', 'float'),
             'c_acctbal': ('C_ACCTBAL', 'float'),
             'n_name': ('N_NAME', 'char(25)'),
             'c_address': ('C_ADDRESS', 'char(40)'),
             'c_phone': ('C_PHONE', 'char(15)'),
             'c_comment': ('C_COMMENT', 'char(117)')})\
        .finalize()
    return query
def validate(self, operator, storage):
    """Recursively validate the plan rooted at `operator`, attaching the
    storage engine to each operator, and return its output schema."""
    opType = operator.operatorType()

    if opType == "TableScan":
        # Leaf: schema comes straight from the catalog.
        return operator.schema()

    if opType == "Select":
        operator.storage = storage
        return self.validate(operator.subPlan, storage)

    if opType == "Project":
        operator.storage = storage
        self.validate(operator.subPlan, storage)
        # Projection rewrites the schema from its projection expressions.
        projected = [(k, v[1]) for (k, v) in operator.projectExprs.items()]
        return DBSchema(operator.relationId(), projected)

    if opType == "GroupBy":
        self.validate(operator.subPlan, storage)
        operator.storage = storage
        return operator.schema()

    if opType == "UnionAll":
        operator.storage = storage
        return self.validate(operator.subPlan, storage)

    # Remaining case: binary join operators.
    operator.lhsSchema = self.validate(operator.lhsPlan, storage)
    operator.rhsSchema = self.validate(operator.rhsPlan, storage)
    operator.initializeSchema()
    operator.storage = storage
    return operator.schema()
def tests(printOutput):
    """Run five TPC-H-style benchmark queries through querytest.

    Fixes relative to the original:
      * query3 filtered on ``L_SHIPDATE == 'BUILDING'``; per its SQL the
        market-segment predicate belongs on ``C_MKTSEGMENT``.
      * query4 and query5 carried a spurious ``L_SHIPDATE == 'BUILDING'``
        filter not present in their SQL; removed.
      * query4's group expression listed ``e.C_PHONE`` twice (8 values for
        a 7-field group schema); deduplicated.
      * query5 grouped on (L_ORDERKEY, O_ORDERDATE, O_SHIPPRIORITY) while
        its group schema is N_NAME; it now groups on N_NAME.
    """
    db = setup()

    '''select sum(l_extendedprice * l_discount) as revenue from lineitem where l_shipdate >= 19940101 and l_shipdate < 19950101 and l_discount between 0.06 - 0.01 and 0.06 + 0.01 and l_quantity < 24'''
    query1 = db.query().fromTable('lineitem') \
        .where('L_SHIPDATE >= 19940101') \
        .where('L_SHIPDATE < 19950101') \
        .where('L_DISCOUNT > 0.06 - 0.01') \
        .where('L_DISCOUNT < 0.06 + 0.01') \
        .where('L_QUANTITY < 24') \
        .groupBy(groupSchema=DBSchema('REV', [('dum', 'int')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (1)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * e.L_DISCOUNT, lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'REVENUE': ('SUM', 'double')}) \
        .finalize()

    '''select sum(l_extendedprice * (1 - l_discount)) as promo_revenue from lineitem, part where l_partkey = p_partkey and l_shipdate >= 19950901 and l_shipdate < 19951001'''
    query2 = db.query().fromTable('lineitem') \
        .join(db.query().fromTable('part'),
              rhsSchema=db.relationSchema('part'),
              method='block-nested-loops',
              expr='L_PARTKEY == P_PARTKEY') \
        .where('L_SHIPDATE >= 19950901') \
        .where('L_SHIPDATE < 19951001') \
        .groupBy(groupSchema=DBSchema('REV', [('dum', 'int')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (1)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'PROMO_REVENUE': ('SUM', 'double')}) \
        .finalize()

    '''select l_orderkey, sum(l_extendedprice * (1 - l_discount)) as revenue, o_orderdate, o_shippriority from customer, orders, lineitem where c_mktsegment = 'BUILDING' and c_custkey = o_custkey and l_orderkey = o_orderkey and o_orderdate < 19950315 and l_shipdate > 19950315 group by l_orderkey, o_orderdate, o_shippriority'''
    query3 = db.query().fromTable('customer') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops',
              expr='C_CUSTKEY == O_CUSTKEY') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops',
              expr='L_ORDERKEY == O_ORDERKEY') \
        .where('C_MKTSEGMENT == \'BUILDING\'') \
        .where('O_ORDERDATE < 19950315') \
        .where('L_SHIPDATE > 19950315') \
        .groupBy(groupSchema=DBSchema('FIRST', [('L_ORDERKEY', 'int'), ('O_ORDERDATE', 'char(10)'), ('O_SHIPPRIORITY', 'int')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (e.L_ORDERKEY, e.O_ORDERDATE, e.O_SHIPPRIORITY)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'L_ORDERKEY': ('L_ORDERKEY', 'int'),
                 'REVENUE': ('SUM', 'double'),
                 'O_ORDERDATE': ('O_ORDERDATE', 'char(10)'),
                 'O_SHIPPRIORITY': ('O_SHIPPRIORITY', 'int')}) \
        .finalize()

    '''select c_custkey, c_name, sum(l_extendedprice * (1 - l_discount)) as revenue, c_acctbal, n_name, c_address, c_phone, c_comment from customer, orders, lineitem, nation where c_custkey = o_custkey and l_orderkey = o_orderkey and o_orderdate >= 19931001 and o_orderdate < 19940101 and l_returnflag = 'R' and c_nationkey = n_nationkey group by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment'''
    query4 = db.query().fromTable('customer') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops',
              expr='C_CUSTKEY == O_CUSTKEY') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops',
              expr='L_ORDERKEY == O_ORDERKEY') \
        .where('O_ORDERDATE >= 19931001') \
        .where('O_ORDERDATE < 19940101') \
        .where('L_RETURNFLAG == \'R\'') \
        .join(db.query().fromTable('nation'),
              rhsSchema=db.relationSchema('nation'),
              method='block-nested-loops',
              expr='C_NATIONKEY == N_NATIONKEY') \
        .groupBy(groupSchema=DBSchema('FIRST', [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'), ('C_ACCTBAL', 'double'), ('C_PHONE', 'char(15)'), ('N_NAME', 'char(25)'), ('C_ADDRESS', 'char(40)'), ('C_COMMENT', 'char(117)')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 # One value per group-schema field (duplicate C_PHONE removed).
                 groupExpr=(lambda e: (e.C_CUSTKEY, e.C_NAME, e.C_ACCTBAL, e.C_PHONE, e.N_NAME, e.C_ADDRESS, e.C_COMMENT)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'C_CUSTKEY': ('C_CUSTKEY', 'int'),
                 'C_NAME': ('C_NAME', 'char(25)'),
                 'SUM': ('SUM', 'double'),
                 'C_ACCTBAL': ('C_ACCTBAL', 'double'),
                 'C_PHONE': ('C_PHONE', 'char(15)'),
                 'N_NAME': ('N_NAME', 'char(25)'),
                 'C_ADDRESS': ('C_ADDRESS', 'char(40)'),
                 'C_COMMENT': ('C_COMMENT', 'char(117)')}) \
        .finalize()

    '''select n_name, sum(l_extendedprice * (1 - l_discount)) as revenue from customer, orders, lineitem, supplier, nation, region where c_custkey = o_custkey and l_orderkey = o_orderkey and l_suppkey = s_suppkey and c_nationkey = s_nationkey and s_nationkey = n_nationkey and n_regionkey = r_regionkey and r_name = 'ASIA' and o_orderdate >= 19940101 and o_orderdate < 19950101 group by n_name'''
    query5 = db.query().fromTable('customer') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops',
              expr='C_CUSTKEY == O_CUSTKEY') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops',
              expr='L_ORDERKEY == O_ORDERKEY') \
        .join(db.query().fromTable('supplier'),
              rhsSchema=db.relationSchema('supplier'),
              method='block-nested-loops',
              expr='L_SUPPKEY == S_SUPPKEY and C_NATIONKEY == S_NATIONKEY') \
        .join(db.query().fromTable('nation'),
              rhsSchema=db.relationSchema('nation'),
              method='block-nested-loops',
              expr='S_NATIONKEY == N_NATIONKEY') \
        .join(db.query().fromTable('region'),
              rhsSchema=db.relationSchema('region'),
              method='block-nested-loops',
              expr='N_REGIONKEY == R_REGIONKEY') \
        .where('R_NAME == \'ASIA\'') \
        .where('O_ORDERDATE >= 19940101') \
        .where('O_ORDERDATE < 19950101') \
        .groupBy(groupSchema=DBSchema('FIRST', [('N_NAME', 'char(25)')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 # Group key matches the single-field N_NAME schema.
                 groupExpr=(lambda e: e.N_NAME),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'N_NAME': ('N_NAME', 'char(25)'), 'SUM': ('SUM', 'double')}) \
        .finalize()

    queries = [query1, query2, query3, query4, query5]
    names = ['query1', 'query2', 'query3', 'query4', 'query5']
    opt = Optimizer.Optimizer(db)
    for i, q in enumerate(queries):
        querytest(db, printOutput, q, names[i], opt)
def initializeSchemas(self):
    """Build the TPC-H table schemas and per-table line parsers.

    Each entry is (table name, field list, parser format) where the format
    string has one character per column: i=int, s=string, d=double,
    t=date stored as int.
    """
    tpchNamesAndFields = [
        ('part',
         [('P_PARTKEY', 'int'), ('P_NAME', 'char(55)'), ('P_MFGR', 'char(25)'),
          ('P_BRAND', 'char(10)'), ('P_TYPE', 'char(25)'), ('P_SIZE', 'int'),
          ('P_CONTAINER', 'char(10)'), ('P_RETAILPRICE', 'double'),
          ('P_COMMENT', 'char(23)')],
         "issssisds"),
        ('supplier',
         [('S_SUPPKEY', 'int'), ('S_NAME', 'char(25)'), ('S_ADDRESS', 'char(40)'),
          ('S_NATIONKEY', 'int'), ('S_PHONE', 'char(15)'), ('S_ACCTBAL', 'double'),
          ('S_COMMENT', 'char(101)')],
         "issisds"),
        ('partsupp',
         [('PS_PARTKEY', 'int'), ('PS_SUPPKEY', 'int'), ('PS_AVAILQTY', 'int'),
          ('PS_SUPPLYCOST', 'double'), ('PS_COMMENT', 'char(199)')],
         "iiids"),
        ('customer',
         [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'), ('C_ADDRESS', 'char(40)'),
          ('C_NATIONKEY', 'int'), ('C_PHONE', 'char(15)'), ('C_ACCTBAL', 'double'),
          ('C_MKTSEGMENT', 'char(10)'), ('C_COMMENT', 'char(117)')],
         "issisdss"),
        ('orders',
         [('O_ORDERKEY', 'int'), ('O_CUSTKEY', 'int'), ('O_ORDERSTATUS', 'char(1)'),
          ('O_TOTALPRICE', 'double'), ('O_ORDERDATE', 'int'),  # date
          ('O_ORDERPRIORITY', 'char(15)'), ('O_CLERK', 'char(15)'),
          ('O_SHIPPRIORITY', 'int'), ('O_COMMENT', 'char(79)')],
         "iisdtssis"),
        ('lineitem',
         [('L_ORDERKEY', 'int'), ('L_PARTKEY', 'int'), ('L_SUPPKEY', 'int'),
          ('L_LINENUMBER', 'int'), ('L_QUANTITY', 'double'),
          ('L_EXTENDEDPRICE', 'double'), ('L_DISCOUNT', 'double'),
          ('L_TAX', 'double'), ('L_RETURNFLAG', 'char(1)'),
          ('L_LINESTATUS', 'char(1)'), ('L_SHIPDATE', 'int'),  # date
          ('L_COMMITDATE', 'int'),  # date
          ('L_RECEIPTDATE', 'int'),  # date
          ('L_SHIPINSTRUCT', 'char(25)'), ('L_SHIPMODE', 'char(10)'),
          ('L_COMMENT', 'char(44)')],
         "iiiiddddsstttsss"),
        ('nation',
         [('N_NATIONKEY', 'int'), ('N_NAME', 'char(25)'),
          ('N_REGIONKEY', 'int'), ('N_COMMENT', 'char(152)')],
         "isis"),
        ('region',
         [('R_REGIONKEY', 'int'), ('R_NAME', 'char(25)'),
          ('R_COMMENT', 'char(152)')],
         "iss"),
    ]
    # Dict comprehensions replace the original map/lambda pipelines.
    self.schemas = {name: DBSchema(name, fields)
                    for (name, fields, _) in tpchNamesAndFields}
    self.parsers = {name: self.buildParser(fmt)
                    for (name, _, fmt) in tpchNamesAndFields}
def initializeSchema(self):
    """Derive the group-by output schema: group-key fields followed by
    aggregate fields, under a name unique to this operator instance."""
    outputName = self.operatorType() + str(self.id())
    combinedFields = self.groupSchema.schema() + self.aggSchema.schema()
    self.outputSchema = DBSchema(outputName, combinedFields)
def checkJoins(self, joins, allAttributes, cachedLHS, cachedRHS):
    """Search `joins` for one applicable to the cached LHS/RHS sub-plans and
    return hash-join construction kwargs for it.

    Returns (True, kwargs) for the first usable join — with key schemas and
    hash functions oriented so the attribute found in cachedLHS's schema is
    on the left — or (False, None) when no join applies.

    Fixes relative to the original:
      * In the expression-join branch the key schemas were built from
        single-element lists (``[[attr, 'int']]``) instead of (name, type)
        tuples, and both were named 'keySchema1'.
      * Falling off the loop returned an implicit None, which breaks
        two-value unpacking by callers; it now returns (False, None).
    """
    for join in joins:
        if join.joinMethod == 'hash':
            lhsCheck = join.lhsKeySchema.fields  # list of field names
            rhsCheck = join.rhsKeySchema.fields  # list of field names
            joinAttr = lhsCheck + rhsCheck
            contains = all(x in allAttributes for x in joinAttr) and any(
                y in cachedRHS.schema().fields for y in joinAttr)
            if contains:
                keySchema1 = DBSchema('keyschema1', [(lhsCheck[0], 'int')])
                keySchema2 = DBSchema('keyschema2', [(rhsCheck[0], 'int')])
                hashFn1 = 'hash(' + lhsCheck[0] + ') % 7'
                hashFn2 = 'hash(' + rhsCheck[0] + ') % 7'
                joinExpr = lhsCheck[0] + ' == ' + rhsCheck[0]
                if lhsCheck[0] in cachedLHS.schema().fields:
                    return True, {
                        'joinMethod': 'hash',
                        'lhsHashFn': hashFn1,
                        'lhsKeySchema': keySchema1,
                        'rhsHashFn': hashFn2,
                        'rhsKeySchema': keySchema2,
                        'joinExpr': joinExpr
                    }
                else:
                    # Orientation swapped: the LHS key lives on the right.
                    return True, {
                        'joinMethod': 'hash',
                        'lhsHashFn': hashFn2,
                        'lhsKeySchema': keySchema2,
                        'rhsHashFn': hashFn1,
                        'rhsKeySchema': keySchema1,
                        'joinExpr': joinExpr
                    }
        elif join.joinExpr:
            joinAttr = ExpressionInfo(join.joinExpr).getAttributes()
            if all(x in allAttributes for x in joinAttr):
                # An equality join expression is expected to reference
                # exactly two attributes.
                joinAttr1 = joinAttr.pop()
                joinAttr2 = joinAttr.pop()
                # Fix: proper (name, type) tuples and distinct schema names.
                keySchema1 = DBSchema('keySchema1', [(joinAttr1, 'int')])
                keySchema2 = DBSchema('keySchema2', [(joinAttr2, 'int')])
                hashFn1 = 'hash(' + joinAttr1 + ') % 7'
                hashFn2 = 'hash(' + joinAttr2 + ') % 7'
                joinExpr = join.joinExpr
                if joinAttr1 in cachedLHS.schema().fields:
                    return True, {
                        'joinMethod': 'hash',
                        'lhsHashFn': hashFn1,
                        'lhsKeySchema': keySchema1,
                        'rhsHashFn': hashFn2,
                        'rhsKeySchema': keySchema2,
                        'joinExpr': joinExpr
                    }
                else:
                    return True, {
                        'joinMethod': 'hash',
                        'lhsHashFn': hashFn2,
                        'lhsKeySchema': keySchema2,
                        'rhsHashFn': hashFn1,
                        'rhsKeySchema': keySchema1,
                        'joinExpr': joinExpr
                    }
        else:
            return False, None
    # Fix: explicit result when the loop exhausts without a match.
    return False, None
def initializeSchema(self):
    """Derive the join output schema: the concatenation of the left and
    right input schemas, under a name unique to this operator instance."""
    outputName = self.operatorType() + str(self.id())
    combinedFields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(outputName, combinedFields)
from customer, orders, lineitem where c_mktsegment = 'BUILDING' and c_custkey = o_custkey and l_orderkey = o_orderkey and o_orderdate < 19950315 and l_shipdate > 19950315 group by l_orderkey, o_orderdate, o_shippriority ''' ls1 = DBSchema('customerKey1', [('C_CUSTKEY', 'int')]) rs1 = DBSchema('customerKey2', [('O_CUSTKEY', 'int')]) ls2 = DBSchema('orderKey1', [('O_ORDERKEY', 'int')]) rs2 = DBSchema('orderkey2', [('L_ORDERKEY', 'int')]) groupKeySchema = DBSchema('groupKey', [('L_ORDERKEY', 'int'), ('O_ORDERDATE', 'int'), ('O_SHIPPRIORITY', 'int')]) groupAggSchema = DBSchema('groupAgg', [('revenue', 'float')]) query3 = db.query().fromTable('customer').join( \ db.query().fromTable('orders'), method = 'hash', \ lhsHashFn = 'hash(C_CUSTKEY) % 5', lhsKeySchema = ls1, \ rhsHashFn = 'hash(O_CUSTKEY) % 5', rhsKeySchema = rs1).join( \
from Storage.BufferPool import BufferPool from Catalog.Schema import DBSchema from Storage.StorageEngine import StorageEngine schemas = [ DBSchema('part', [('p_partkey', 'int'), ('p_name', 'char(55)'), ('p_mfgr', 'char(25)'), ('p_brand', 'char(10)'), ('p_type', 'char(25)'), ('p_size', 'int'), ('p_container', 'char(10)'), ('p_retailprice', 'float'), ('p_comment', 'char(23)')]) , DBSchema('supplier', [('s_suppkey', 'int'), ('s_name', 'char(25)'), ('s_address', 'char(40)'), ('s_nationkey', 'int'), ('s_phone', 'char(15)'), ('s_acctbal', 'float'), ('s_comment', 'char(101)')]) , DBSchema('partsupp', [('ps_partkey', 'int'), ('ps_suppkey', 'int'), ('ps_availqty', 'int'), ('ps_supplycost', 'float'), ('ps_comment', 'char(199)')])
# Projected base-table scans: part (name + key), partsupp filtered on
# availability, and supplier (name + key).
part = db.query().fromTable('part').select({
    'P_NAME': ('P_NAME', 'char(55)'),
    'P_PARTKEY': ('P_PARTKEY', 'int')
})
partsupp = db.query().fromTable('partsupp').where('PS_AVAILQTY == 1').select({
    'PS_PARTKEY': ('PS_PARTKEY', 'int'),
    'PS_SUPPKEY': ('PS_SUPPKEY', 'int')
})
supplier = db.query().fromTable('supplier').select({
    'S_NAME': ('S_NAME', 'char(25)'),
    'S_SUPPKEY': ('S_SUPPKEY', 'int')
})
# Two-step block-nested-loops join: (partsupp ⋈ part) then ⋈ supplier,
# keeping only the part and supplier names.
join_ps_p = partsupp.join(\
    part, \
    rhsSchema = DBSchema('part', [('P_NAME','char(55)'), ('P_PARTKEY','int')]), \
    method = 'block-nested-loops', \
    expr = 'PS_PARTKEY == P_PARTKEY')
join_three = join_ps_p.join(\
    supplier, \
    rhsSchema = DBSchema('supplier', [('S_NAME','char(25)'), ('S_SUPPKEY', 'int')]), \
    method = 'block-nested-loops',
    expr = 'PS_SUPPKEY == S_SUPPKEY'
).select({'P_NAME':('P_NAME','char(55)'), 'S_NAME':('S_NAME','char(25)')})
# Alternate partsupp scan filtered on supply cost (used by a union branch).
partsupp2 = db.query().fromTable('partsupp').where('PS_SUPPLYCOST < 5').select(
    {
        'PS_PARTKEY': ('PS_PARTKEY', 'int'),
        'PS_SUPPKEY': ('PS_SUPPKEY', 'int')
    })
exp 1 TPC-H Query 6: a 4-chain filter and aggregate query select sum(l_extendedprice * l_discount) as revenue from lineitem where l_shipdate >= 19940101 and l_shipdate < 19950101 and l_discount between 0.06 - 0.01 and 0.06 + 0.01 and l_quantity < 24 ''' groupKeySchema = DBSchema('groupKey', [('ONE', 'int')]) groupAggSchema = DBSchema('groupBy', [('revenue', 'float')]) query1 = db.query().fromTable('lineitem').where( \ "(L_SHIPDATE >= 19940101) and (L_SHIPDATE < 19950101) and \ (0.06 - 0.01 <= L_DISCOUNT <= 0.06 + 0.01) and (L_QUANTITY < 24)" ).groupBy( \ groupSchema=groupKeySchema, \ aggSchema=groupAggSchema, \ groupExpr=(lambda e: 1), \ aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * e.L_DISCOUNT), lambda x: x)], \ groupHashFn=(lambda gbVal: hash(gbVal) % 1)).select( \ {'revenue' : ('revenue', 'float')}).finalize() ''' exp 2 TPC-H Query 14: a 2-way join and aggregate query
schema1.pack( schema1.instantiate(i, random.randint(0, 50), 'This is a testing tuple.')) for i in range(0, 10000) ]: _ = db.insertTuple(schema1.name, tup) for tup in [ schema2.pack( schema2.instantiate(i, random.randint(0, 50), 'This is a testing tuple.')) for i in range(0, 10000) ]: _ = db.insertTuple(schema2.name, tup) keySchema = DBSchema('employeeKey', [('id', 'int')]) keySchema2 = DBSchema('employeeKey2', [('id2', 'int')]) query5 = db.query().fromTable('employeeeee').join( \ db.query().fromTable('employeeeee2'), \ rhsSchema=schema2, \ method='hash', \ lhsHashFn=(lambda e : e.id % 4), lhsKeySchema=keySchema, \ rhsHashFn=(lambda e : e.id2 % 4), rhsKeySchema=keySchema2, \ ).finalize() print(query5.explain()) def readResult(): for page in db.processQuery(query5):
def joinsOptimizer(self, operator, aPaths):
    """Greedy join-order optimizer.

    Starting from the access paths in `aPaths`, repeatedly picks the
    cheapest joinable pair (cost estimated as 3 * (lhs pages + rhs pages)
    plus accumulated child costs, i.e. a hash-join page-I/O model),
    replaces the pair with the new hash join, and continues until one plan
    remains. Returns that final plan root.
    """
    defaultScaleFactor = 10
    defaultPartiNumber = 5
    n = len(aPaths)
    planList = []
    costList = []
    # i = 1
    for aPath in aPaths:
        # Here we define cost by number of pages.
        cards = Plan(root=aPath).sample(defaultScaleFactor)
        pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
        numPages = cards / (pageSize / aPath.schema().size)
        # Here we only consider reorganize joins
        # so that we simple put accessPaths' total cost as 0.
        planList.append(aPath)
        costList.append((numPages, 0))
    # i = 2...n
    for i in range(1, n):
        # find all possible two way join in current planList
        # put the potential joins in potentialP
        # put the potential joins cost in potentialC
        m = len(planList)
        potentialP = []
        potentialC = []
        for j in range(0, m - 1):
            for k in range(j + 1, m):
                self.pcntr += 1  # count of plans considered (reported below)
                potentialP.append((planList[j], planList[k]))
                potentialC.append(3 * (costList[j][0] + costList[k][0]) +
                                  costList[j][1] + costList[k][1])
        # find the cheapest joinable join (total cost)
        # build the join, remove the used two base plan and add the new join to planList
        # modify the costList as well
        while (potentialC):
            currC = min(potentialC)
            currP = potentialP[potentialC.index(currC)]
            potentialC.remove(currC)
            potentialP.remove(currP)
            if (self.joinable(operator, currP)):
                (lField, rField) = self.joinable(operator, currP)
                lhsSchema = currP[0].schema()
                rhsSchema = currP[1].schema()
                lKeySchema = DBSchema(
                    'left',
                    [(f, t) for (f, t) in lhsSchema.schema() if f == lField])
                rKeySchema = DBSchema(
                    'right',
                    [(f, t) for (f, t) in rhsSchema.schema() if f == rField])
                lHashFn = 'hash(' + lField + ') % ' + str(defaultPartiNumber)
                rHashFn = 'hash(' + rField + ') % ' + str(defaultPartiNumber)
                newJoin = Join(currP[0], currP[1], method='hash', \
                               lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                               rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)
                newJoin.prepare(self.db)
                totalCost = currC
                cards = Plan(root=newJoin).sample(defaultScaleFactor)
                pageSize, _, _ = self.db.storage.relationStats(
                    newJoin.relationId())
                pages = cards / (pageSize / newJoin.schema().size)
                # Remove the two consumed plans; id2 is recomputed after the
                # first pop so both pops hit the intended elements, and the
                # same indices then remove the matching costList entries.
                id1 = planList.index(currP[0])
                _ = planList.pop(id1)
                id2 = planList.index(currP[1])
                _ = planList.pop(id2)
                planList.append(newJoin)
                _ = costList.pop(id1)
                _ = costList.pop(id2)
                costList.append((pages, totalCost))
                break
    print("GreedyOptimizer plan considered: ", self.pcntr)
    return planList[0]
# Ad-hoc smoke test for the storage layer: wire up a BufferPool and
# FileManager, create an 'employee' relation, and construct two empty pages.
import io, math, os, os.path, pickle, struct
from struct import Struct
from Catalog.Identifiers import PageId, FileId, TupleId
from Catalog.Schema import DBSchema
from Storage.Page import PageHeader, Page
from Storage.SlottedPage import SlottedPageHeader, SlottedPage
from Storage.File import *
import shutil, Storage.BufferPool, Storage.FileManager

schema = DBSchema('employee', [('id', 'int'), ('age', 'int')])
bp = Storage.BufferPool.BufferPool()
fm = Storage.FileManager.FileManager(bufferPool=bp)
bp.setFileManager(fm)
fm.createRelation(schema.name, schema)
(fId, f) = fm.relationFile(schema.name)
# NOTE(review): bare comparison, result discarded — reads like a
# doctest-style check that a fresh relation file has no pages.
f.numPages() == 0
pId = PageId(fId, 0)
pId1 = PageId(fId, 1)
# Two zero-filled pages sized to the file's page size.
p = Page(pageId=pId, buffer=bytes(f.pageSize()), schema=schema)
p1 = Page(pageId=pId1, buffer=bytes(f.pageSize()), schema=schema)
import Database from Query.Optimizer import Optimizer from Query.BushyOptimizer import BushyOptimizer from Catalog.Schema import DBSchema from time import time db = Database.Database() deptSchema = DBSchema('department', [('d_id', 'int'), ('d_name', 'char(30)')]) emplSchema = DBSchema('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')]) projSchema = DBSchema('project', [('p_id', 'int'), ('p_name', 'char(30)')]) gratSchema = DBSchema('grant', [('g_id', 'int'), ('g_projectid', 'int'), ('g_source', 'char(30)')]) synSchema1 = DBSchema('syn1', [('a', 'int'), ('b', 'char(30)')]) synSchema2 = DBSchema('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')]) db.createRelation('department', [('d_id', 'int'), ('d_name', 'char(30)')]) db.createRelation('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')]) db.createRelation('project', [('p_id', 'int'), ('p_name', 'char(30)')]) db.createRelation('grant', [('g_id', 'int'), ('g_projectid', 'int'), ('g_source', 'char(30)')]) db.createRelation('syn1', [('a', 'int'), ('b', 'char(30)')]) db.createRelation('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')]) for tup in [ deptSchema.pack(deptSchema.instantiate(i, "Nature" + str(i))) for i in range(4000) ]: _ = db.insertTuple('department', tup) for tup in [
def __init__(self, subPlan, projectExprs, **kwargs):
    """Project operator: evaluates `projectExprs` over `subPlan`'s output.

    The output schema takes one (name, type) field per projection entry,
    using the entry's declared type.
    """
    super().__init__(**kwargs)
    self.subPlan = subPlan
    self.projectExprs = projectExprs
    fields = [(name, exprAndType[1])
              for (name, exprAndType) in self.projectExprs.items()]
    self.outputSchema = DBSchema(self.relationId(), fields)
def query5(db):
    """Build TPC-H Q5-style plan: revenue per nation for ASIA suppliers on
    1994 orders, via a five-join hash chain.

    Fix: the final select sourced from a nonexistent lowercase field
    ('n_name'); the group schema defines 'N_NAME', so the projection now
    reads from 'N_NAME'.
    """
    query = db.query().fromTable('region')\
        .join(
            db.query().fromTable('nation'),
            method='hash',
            lhsHashFn='hash(R_REGIONKEY) % 11',
            lhsKeySchema=DBSchema('regionKey1', [('R_REGIONKEY', 'int')]),
            rhsHashFn='hash(N_REGIONKEY) % 11',
            rhsKeySchema=DBSchema('regionKey2', [('N_REGIONKEY', 'int')]))\
        .join(
            db.query().fromTable('supplier'),
            method='hash',
            lhsHashFn='hash(N_NATIONKEY) % 11',
            lhsKeySchema=DBSchema('nationKey1', [('N_NATIONKEY', 'int')]),
            rhsHashFn='hash(S_NATIONKEY) % 11',
            rhsKeySchema=DBSchema('nationKey2', [('S_NATIONKEY', 'int')])) \
        .join(
            db.query().fromTable('lineitem'),
            method='hash',
            lhsHashFn='hash(S_SUPPKEY) % 11',
            lhsKeySchema=DBSchema('suppKey1', [('S_SUPPKEY', 'int')]),
            rhsHashFn='hash(L_SUPPKEY) % 11',
            rhsKeySchema=DBSchema('suppKey2', [('L_SUPPKEY', 'int')])) \
        .join(
            db.query().fromTable('orders'),
            method='hash',
            lhsHashFn='hash(L_ORDERKEY) % 11',
            lhsKeySchema=DBSchema('orderKey1', [('L_ORDERKEY', 'int')]),
            rhsHashFn='hash(O_ORDERKEY) % 11',
            rhsKeySchema=DBSchema('orderKey2', [('O_ORDERKEY', 'int')])) \
        .join(
            db.query().fromTable('customer'),
            method='hash',
            lhsHashFn='hash(O_CUSTKEY) % 11',
            lhsKeySchema=DBSchema('custKey1', [('O_CUSTKEY', 'int')]),
            rhsHashFn='hash(C_CUSTKEY) % 11',
            rhsKeySchema=DBSchema('custKey2', [('C_CUSTKEY', 'int')])) \
        .where(
            "R_NAME == 'ASIA' and O_ORDERDATE >= 19940101 and O_ORDERDATE < 19950101")\
        .groupBy(
            groupSchema=DBSchema('groupKey', [('N_NAME', 'char(25)')]),
            aggSchema=DBSchema('groupAgg', [('revenue', 'float')]),
            groupExpr=(lambda e: e.N_NAME),
            aggExprs=[(0,
                       lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 11))\
        .select(
            {'n_name': ('N_NAME', 'char(25)'),
             'revenue': ('revenue', 'float')})\
        .finalize()
    return query