def processAllPages(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    outputSchema = DBSchema(schema, fields)

    # Phase 1: partition the input into temporary relations by hashing the group key.
    relIds = []
    for (pageId, page) in iter(self.subPlan):
        for tpl in page:
            group = self.groupExpr(self.subSchema.unpack(tpl))
            key = self.groupHashFn((group, None))
            relId = str(self.id()) + "u" + str(key)
            if relId not in relIds:
                self.storage.createRelation(relId, self.subSchema)
                relIds.append(relId)
            self.storage.insertTuple(relId, tpl)

    # Phase 2: aggregate each partition independently.
    for rid in relIds:
        groupDict = {}
        for (pageId, page) in self.storage.pages(rid):
            for tpl in page:
                groupKey = self.groupExpr(self.subSchema.unpack(tpl))
                if groupKey not in groupDict:
                    # Initialize one accumulator per aggregate expression.
                    groupDict[groupKey] = [trio[0] for trio in self.aggExprs]
                for i in range(len(self.aggExprs)):
                    groupDict[groupKey][i] = self.aggExprs[i][1](
                        groupDict[groupKey][i], self.subSchema.unpack(tpl))

        # Finalize each group's accumulators, then emit one output tuple per group.
        for key in groupDict:
            for i in range(len(self.aggExprs)):
                groupDict[key][i] = self.aggExprs[i][2](groupDict[key][i])

        for key in groupDict:
            outTuple = outputSchema.instantiate(key, *groupDict[key])
            self.emitOutputTuple(outputSchema.pack(outTuple))

    return self.storage.pages(self.relationId())
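# The aggExprs used above follow an (initial, increment, finalize) convention:
# one triple per aggregate output field. A minimal, self-contained sketch of that
# fold over plain namedtuples (the Sale type and field names are illustrative,
# not part of the operator API shown here):

from collections import namedtuple

Sale = namedtuple("Sale", ["id", "sales"])

# count and sum(sales), each as an (initial, increment, finalize) triple
aggExprs = [
    (0, lambda acc, t: acc + 1,       lambda acc: acc),
    (0, lambda acc, t: acc + t.sales, lambda acc: acc),
]

def foldGroup(tuples, aggExprs):
    accs = [init for (init, _, _) in aggExprs]
    for t in tuples:
        accs = [incr(acc, t) for acc, (_, incr, _) in zip(accs, aggExprs)]
    return [final(acc) for acc, (_, _, final) in zip(accs, aggExprs)]

print(foldGroup([Sale(1, 10), Sale(2, 5)], aggExprs))  # [2, 15]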
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get( "lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get( "rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in [ "nested-loops", "block-nested-loops", "indexed", "hash" ]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete join specification, missing join operator parameter" ) # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError( "Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops': 'NL', 'block-nested-loops': 'BNL', 'indexed': 'Index', 'hash': 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): self.initializeOutput() self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): blockIds = [] while bufPool.numFreePages() > 0: try: pId, page = next(pageIterator) bufPool.getPage(pId, pinned=True) blockIds.append(pId) except StopIteration: pageIterator = None break return (blockIds, pageIterator) def blockNestedLoops(self): self._blockNestedLoops(iter(self.lhsPlan), iter(self.rhsPlan)) return self.storage.pages(self.relationId()) def _blockNestedLoops(self, lPageIter, rPageIter): while lPageIter is not None: blockIds, lPageIter = self.accessPageBlock(self.storage.bufferPool, lPageIter) for lPageId in blockIds: lPage = self.storage.bufferPool.getPage(lPageId) for lTuple in lPage: joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rPage) in rPageIter: for rTuple in rPage: joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) if self.joinExpr: isValid = eval(self.joinExpr, globals(), joinExprEnv) else: # For some reason using this comparison causes the test to fail. 
# # lKey = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) # rKey = self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) # isValid = lKey == rKey isValid = True if isValid: outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] self.storage.bufferPool.unpinPage(lPageId) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): raise NotImplementedError ################################## # # Hash join implementation. # def hashJoin(self): lRelHashMap = self.hashPartition(self.lhsPlan, self.lhsHashFn, self.lhsSchema, "_lhs") rRelHashMap = self.hashPartition(self.rhsPlan, self.rhsHashFn, self.rhsSchema, "_rhs") for hashVal in lRelHashMap.keys(): lPageIter = self.storage.pages(lRelHashMap[hashVal]) rPageIter = self.storage.pages(rRelHashMap[hashVal]) self._blockNestedLoops(lPageIter, rPageIter) self.storage.removeRelation(lRelHashMap[hashVal]) self.storage.removeRelation(rRelHashMap[hashVal]) return self.storage.pages(self.relationId()) def hashPartition(self, plan, hashFn, schema, side): relHashMap = {} for (pagId, page) in iter(plan): for tup in page: hashVal = str( eval(hashFn, globals(), self.loadSchema(schema, tup))) if hashVal not in relHashMap.keys(): relId = hashVal + side self.storage.createRelation(relId, schema) relHashMap[hashVal] = relId self.storage.insertTuple(relHashMap[hashVal], tup) return relHashMap # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString(), "rhsKeySchema=" + self.rhsKeySchema.toString(), "lhsHashFn='" + self.lhsHashFn + "'", "rhsHashFn='" + self.rhsHashFn + "'" ]))) + ")" return super().explain() + exprs
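# The commented-out comparison above projects each tuple onto its key schema and
# compares the packed bytes. The same idea, sketched over plain dicts so it runs
# standalone (projectKey and the field names are illustrative, not the DBSchema API):

def projectKey(row, keyFields):
    return tuple(row[f] for f in keyFields)

lRow = {"id": 3, "name": "a"}
rRow = {"rid": 3, "val": 7.5}

isValid = projectKey(lRow, ["id"]) == projectKey(rRow, ["rid"])
print(isValid)  # True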
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): self.initializeOutput() self.inputFinished = False if not self.pipelined: self.outputIterator = self.processAllPages() return self def __next__(self): if self.pipelined: while not(self.inputFinished or self.isOutputPageReady()): try: pageId, page = next(self.inputIterator) self.processInputPage(pageId, page) except StopIteration: self.inputFinished = True return self.outputPage() else: return next(self.outputIterator) #raise NotImplementedError # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): pinnedPages = [] M = bufPool.numPages() count = 0 try: while count < (M-2): (pageId,pageObj) = next(pageIterator) bufPool.pinPage(pageId) pinnedPages.append((pageId, pageObj)) count += 1 except StopIteration: pass return pinnedPages def blockNestedLoops(self): lIter = iter(self.lhsPlan) pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter) while (len(pinnedPages) > 0): for (lPageId, lhsPage) in iter(pinnedPages): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. 
if self.outputPages: self.outputPages = [self.outputPages[-1]] for (pageId, pageObj) in pinnedPages: self.storage.bufferPool.unpinPage(pageId) pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) joinKey = self.lhsKeySchema.pack(self.lhsSchema.project(self.lhsSchema.unpack(lTuple), self.lhsKeySchema)) #matches is an iterator over tuple IDs matches = self.storage.fileMgr.lookupByIndex(self.rhsPlan.relationId(), self.indexId, joinKey) if not matches: continue for rTupleID in matches: rFile = self.storage.fileMgr.relationFile(self.rhsPlan.relationId())[1] pId = rTupleID.pageId rpage = rFile.bufferPool.getPage(pId) rtupleData = rpage.getTuple(rTupleID) #unpack rtupleData? joinExprEnv.update(self.loadSchema(self.rhsSchema, rtupleData)) if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) return self.storage.pages(self.relationId()) #raise NotImplementedError ################################## # # Hash join implementation. # def hashJoin(self): lRelIds = [] rRelIds = [] for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: hashExprEnv = self.loadSchema(self.lhsSchema, lTuple) tupleHash = eval(self.lhsHashFn, globals(), hashExprEnv) relId = str(self.id()) + "l" + str(tupleHash) self.storage.createRelation(relId, self.lhsSchema) self.storage.insertTuple(relId, lTuple) if str(tupleHash) not in lRelIds: lRelIds.append(str(tupleHash)) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: hashExprEnv = self.loadSchema(self.rhsSchema, rTuple) tupleHash = eval(self.rhsHashFn, globals(), hashExprEnv) relId = str(self.id()) + "r" + str(tupleHash) self.storage.createRelation(relId, self.rhsSchema) self.storage.insertTuple(relId, rTuple) if str(tupleHash) not in rRelIds: rRelIds.append(str(tupleHash)) if not self.joinExpr: self.joinExpr = "True" for k in range(len(self.lhsKeySchema.fields)): self.joinExpr += " and " + self.lhsKeySchema.fields[k] + " == " + self.rhsKeySchema.fields[k] for lId in lRelIds: if lId in rRelIds: ######DO BNLJ####### lIter = iter(self.storage.pages(str(self.id()) + "l" + lId)) pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter) while (len(pinnedPages) > 0): for (lPageId, lhsPage) in iter(pinnedPages): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.storage.pages(str(self.id()) + "r" + lId)): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. 
if self.outputPages: self.outputPages = [self.outputPages[-1]] for (pageId, pageObj) in pinnedPages: self.storage.bufferPool.unpinPage(pageId) pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter) ######END BNLJ###### return self.storage.pages(self.relationId()) # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "indexKeySchema=" + self.lhsKeySchema.toString() ] ))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString() , "rhsKeySchema=" + self.rhsKeySchema.toString() , "lhsHashFn='" + self.lhsHashFn + "'" , "rhsHashFn='" + self.rhsHashFn + "'" ] ))) + ")" return super().explain() + exprs
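# The block-nested-loops joins above pin a block of outer pages (bounded by the
# buffer pool size) and scan the inner relation once per block. The core access
# pattern, sketched over in-memory "pages" (lists of rows), with blockSize standing
# in for the number of pinnable frames; illustrative only:

def blockNestedLoopsJoin(outerPages, innerPages, pred, blockSize):
    def drain(block):
        for oRow in (row for page in block for row in page):
            for iRow in (row for page in innerPages for row in page):
                if pred(oRow, iRow):
                    yield oRow + iRow
    block = []
    for page in outerPages:
        block.append(page)
        if len(block) == blockSize:
            yield from drain(block)
            block = []
    if block:
        yield from drain(block)

outer = [[(1, "a")], [(2, "b")], [(3, "c")]]
inner = [[(2, "x"), (3, "y")]]
print(list(blockNestedLoopsJoin(outer, inner, lambda l, r: l[0] == r[0], 2)))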
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): self.initializeOutput() # Pipelined join operator is not supported according to constructor self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") # Return an iterator to the output relation def outputRelationIterator(self): return self.storage.pages(self.relationId()) ################################## # # Nested loops implementation # def nestedLoops(self): self.runNestedLoops(iter(self.lhsPlan), iter(self.rhsPlan), False, False, False) # Return an iterator to the output relation return self.outputRelationIterator() # Common function used by all types of joins def runNestedLoops(self, lhsPageIter, rhsPageIter, isBlock, isIndex, isHash): for (lPageId, lhsPage) in lhsPageIter: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) if isIndex: keyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) idxManager = self.storage.fileMgr.indexManager rhsPageIter = idxManager.lookupByIndex(self.indexId, keyData) for rhsItem in rhsPageIter: rhsTupleIter = None if isIndex: # Retrieve index-matched tuple from corresponding page page = self.storage.bufferPool.getPage(rhsItem.pageId) # rhsItem = rhsTupId rhsTupleIter = [page.getTuple(rhsItem)] else: # Need to scan all tuples rhsTupleIter = rhsItem[1] # rhsItem = (rPageId, rhsPage) for rTuple in rhsTupleIter: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. validJoin = False if isIndex: validJoin = True else: if isHash: lhsKeyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) rhsKeyData = self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) validJoin = lhsKeyData == rhsKeyData else: validJoin = True if self.joinExpr: validJoin = validJoin and eval(self.joinExpr, globals(), joinExprEnv) if validJoin: outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] if isBlock: self.storage.bufferPool.unpinPage(lPageId) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. 
def accessPageBlock(self, bufPool, pageIterator): pinnedPages = list() try: while bufPool.numFreePages() > 0: (lPageId, lhsPage) = next(pageIterator) bufPool.pinPage(lPageId) pinnedPages.append((lPageId, lhsPage)) except StopIteration: pass return pinnedPages def pinPages(self, pageIterator): return self.accessPageBlock(self.storage.bufferPool, pageIterator) def blockNestedLoops(self): self.runBlockNestedLoops(iter(self.lhsPlan), self.rhsPlan, False) return self.outputRelationIterator() def runBlockNestedLoops(self, lhsPageIter, rhsPageIter, isHashJoin): pinnedPages = self.pinPages(lhsPageIter) # Keep running untill ALL pages have been loaded # Note: 'rhsPageIter' should be 'list' type NOT 'iter' while len(pinnedPages) > 0: self.runNestedLoops(iter(pinnedPages), rhsPageIter, True, False, isHashJoin) pinnedPages = self.pinPages(lhsPageIter) ################################## # # Indexed nested loops implementation # def indexedNestedLoops(self): self.runNestedLoops(iter(self.lhsPlan), None, False, True, False) return self.outputRelationIterator() ################################## # # Hash join implementation. # def hashJoin(self): lhsRelIdMap = {} rhsRelIdMap = {} # Partition each relation using hash function self.partition(self.lhsPlan, self.lhsHashFn, self.lhsSchema, lhsRelIdMap, "lhs") self.partition(self.rhsPlan, self.rhsHashFn, self.rhsSchema, rhsRelIdMap, "rhs") # Perform block nested loop join for each bucket for hashValue, relId in lhsRelIdMap.items(): lhsPageIter = self.storage.pages(relId) rhsPageIter = self.storage.pages(rhsRelIdMap[hashValue]) self.runBlockNestedLoops(lhsPageIter, list(rhsPageIter), True) # Remove partitions partitionIter = itertools.chain(lhsRelIdMap.items(), rhsRelIdMap.items()) for _, relId in partitionIter: self.storage.removeRelation(relId) return self.outputRelationIterator() # Partitions a given relation based on some hash function def partition(self, plan, hashFn, schema, relIdMap, relPrefix): for (pageId, page) in iter(plan): for tuple in page: # Compute hash value for every tuple fieldBindings = self.loadSchema(schema, tuple) hashValue = eval(hashFn, globals(), fieldBindings) # Store in temporary buckets (files) if not hashValue in relIdMap: relId = str(self.id()) + "_" + relPrefix + "_" + str(hashValue) self.storage.createRelation(relId, schema) relIdMap[hashValue] = relId self.storage.insertTuple(relIdMap[hashValue], tuple) # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "indexKeySchema=" + self.lhsKeySchema.toString() ] ))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString() , "rhsKeySchema=" + self.rhsKeySchema.toString() , "lhsHashFn='" + self.lhsHashFn + "'" , "rhsHashFn='" + self.rhsHashFn + "'" ] ))) + ")" return super().explain() + exprs
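# hashJoin above is a grace hash join: partition both inputs on a hash of the join
# key, then join only matching buckets, re-checking the key within each bucket since
# different keys can hash to the same bucket. The same control flow over in-memory
# lists, illustrative only:

from collections import defaultdict

def graceHashJoin(lhs, rhs, lKey, rKey, nBuckets=4):
    lParts, rParts = defaultdict(list), defaultdict(list)
    for row in lhs:
        lParts[hash(lKey(row)) % nBuckets].append(row)
    for row in rhs:
        rParts[hash(rKey(row)) % nBuckets].append(row)
    for b in lParts:
        for l in lParts[b]:
            for r in rParts.get(b, []):
                if lKey(l) == rKey(r):
                    yield l + r

print(list(graceHashJoin([(1, "a"), (2, "b")], [(2, "x")],
                         lambda t: t[0], lambda t: t[0])))  # [(2, 'b', 2, 'x')]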
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete group-by specification, missing a required parameter") if not self.aggExprs: raise ValueError("Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. 
def __iter__(self): self.initializeOutput() self.inputIterator = iter(self.subPlan) self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): '''if self.inputIterator is None: self.inputIterator = iter(self.subPlan) relIds = [] try: for (pageId, page) in self.inputIterator: for tuple in page: key = self.groupExpr(self.subSchema.unpack(tuple)), partition = self.groupHashFn(key) relId = "newGB_" + str(partition) if not self.storage.hasRelation(relId): self.storage.createRelation(relId, self.subSchema) relIds.append(relId) partFile = self.storage.fileMgr.relationFile(relId)[1] if partFile: partFile.insertTuple(tuple) #self.storage.insertTuple(relId, tuple) except StopIteration: pass for relId in relIds: partFile = self.storage.fileMgr.relationFile(relId)[1] groupDict = dict() for tuple in partFile.pages(): currInput = self.subSchema.unpack(tuple) key = self.subSchema.projectBinary(tuple, self.groupSchema) if key not in groupDict: currAgg = self.aggSchema.instantiate(*[e[0] for e in self.aggExprs]) else: currAgg = self.aggSchema.unpack(groupDict[key]) groupDict[key] = self.aggSchema.pack(self.aggSchema.instantiate(\ *[self.aggExprs[i][1](currAgg[i], currInput)\ for i in range(len(self.aggExprs))])) for k, v in groupDict.items(): currAgg = self.aggSchema.unpack(v) finalVal = self.aggSchema.pack(self.aggSchema.instantiate(\ *[self.aggExprs[i][2](currAgg[i]) for i in range(len(self.aggExprs))])) output = self.loadSchema(self.groupSchema, k) output.update(self.loadSchema(self.aggSchema, finalVal)) outputTuple = self.outputSchema.instantiate(*[output[f] for f in self.outputSchema.fields]) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] return self.storage.pages(self.relationId())''' self.partitionFiles = {} for (pageId, page) in self.inputIterator: for tup in page: groupVal = self.groupExpr(self.subSchema.unpack(tup)), groupId = self.groupHashFn(groupVal) partitionRelId = "GBpartition_" + str(groupId) if not self.storage.hasRelation(partitionRelId): self.storage.createRelation(partitionRelId, self.subSchema) self.partitionFiles[groupId] = partitionRelId partFile = self.storage.fileMgr.relationFile(partitionRelId)[1] if partFile: partFile.insertTuple(tup) for partitionRelId in self.partitionFiles.values(): partFile = self.storage.fileMgr.relationFile(partitionRelId)[1] groupDict = {} for (pageId, page) in partFile.pages(): for tup in page: currInput = self.subSchema.unpack(tup) key = self.groupExpr(currInput), if key not in groupDict: groupDict[key] = self.aggSchema.instantiate(*[e[0] for e in self.aggExprs]) groupDict[key] = self.aggSchema.instantiate(\ *[self.aggExprs[i][1](groupDict[key][i], currInput)\ for i in range(len(self.aggExprs))]) for (groupVal, aggVals) in groupDict.items(): finalVal = self.aggSchema.instantiate(\ *[self.aggExprs[i][2](aggVals[i]) for i in range(len(self.aggExprs))]) outputTuple = self.outputSchema.instantiate(*(list(groupVal) + list(finalVal))) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] self.removePartitionFiles() return self.storage.pages(self.relationId()) def removePartitionFiles(self): for partitionRelId in self.partitionFiles.values(): 
      self.storage.removeRelation(partitionRelId)
    self.partitionFiles = {}

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                             + ", aggSchema=" + self.aggSchema.toString() + ")"
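# processAllPages above partitions the input by a hash of the group key into
# temporary relations, then aggregates each partition independently so only one
# partition's groups need to fit in memory at a time. The same two-phase flow over
# in-memory lists (summing the second field of each row, illustrative only):

from collections import defaultdict

def hashGroupBy(rows, keyFn, nPartitions=4):
    parts = defaultdict(list)
    for row in rows:
        parts[hash(keyFn(row)) % nPartitions].append(row)
    result = {}
    for part in parts.values():
        groups = defaultdict(int)
        for row in part:
            groups[keyFn(row)] += row[1]
        result.update(groups)
    return result

print(hashGroupBy([(1, 10), (1, 5), (2, 3)], lambda r: r[0]))  # {1: 15, 2: 3}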
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get( "lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get( "rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) self.pidsInBlock = list() self.tempFileHashR = dict() self.outputPageHashR = dict() self.tempFileHashL = dict() self.outputPageHashL = dict() self.tempFile = None # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in [ "nested-loops", "block-nested-loops", "indexed", "hash" ]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete join specification, missing join operator parameter" ) # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError( "Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops': 'NL', 'block-nested-loops': 'BNL', 'indexed': 'Index', 'hash': 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): self.initializeOutput() self.inputIteratorL = iter(self.lhsPlan) self.inputFinished = False if not self.pipelined: if self.joinMethod == 'hash': self.outputIterator = self.hashJoin() elif self.joinMethod == 'block-nested-loops': self.outputIterator = self.blockNestedLoops() elif self.joinMethod == 'nested-loops': self.outputIterator = self.nestedLoops() return self def __next__(self): self.inputIteratorR = iter(self.rhsPlan) if self.pipelined: while not (self.inputFinished or self.isOutputPageReady()): try: lPageId, lhsPage = next(self.inputIteratorL) for lTuple in lhsPage: compare(lTuple) if self.outputPages: self.outputPages = [self.outputPages[-1]] except StopIteration: self.inputFinished = True return self.outputPage() else: return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. #compare(lTuple) if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) def compare(lTuple): # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate( *[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. 
def accessPageBlock(self, bufPool, pageIterator): for pid in self.pidsInBlock: bufPool.unpinPage(pid) self.pidsInBlock = list() M = bufPool.freeSpace() for i in range(0, M - 2): try: (pid, page) = next(pageIterator) except: break #if pid is None: # break self.pidsInBlock.append(pid) bufPool.getPage(pid, pinned=True) bufPool.pinPage(pid) def blockNestedLoops(self): riter = iter(self.rhsPlan) buf = self.storage.bufferPool while riter.hasNext(): self.accessPageBlock(buf, riter) for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for pid in self.pidsInBlock: rhsPage = buf.getPage(pid, pinned=True) for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] self.accessPageBlock(buf, riter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): raise NotImplementedError ################################## # # Hash join implementation. # def hashJoin(self): for (rPageId, rhsPage) in iter(self.rhsPlan): for tuple in rhsPage: val = self.loadSchema(self.rhsSchema, tuple) hash = eval(self.rhsHashFn, globals(), val) self.emitOutputTupleHash(tuple, hash, False) for (lPageId, lhsPage) in iter(self.lhsPlan): for tuple in lhsPage: val = self.loadSchema(self.lhsSchema, tuple) hash = eval(self.lhsHashFn, globals(), val) self.emitOutputTupleHash(tuple, hash, True) evalStr = '' for i, lt in enumerate(self.lhsKeySchema.schema()): rt = self.rhsKeySchema.schema()[i] evalStr += str(lt[0]) + '==' + str(rt[0]) if i != 0 and i != len(self.lhsKeySchema.schema()) - 1: evalStr += ' and ' if self.joinExpr is not None: evalStr += ' and ' + self.joinExpr for lk in self.outputPageHashL.keys(): for rk in self.outputPageHashR.keys(): riter = iter(self.outputPageHashR[rk]) buf = self.storage.bufferPool M = buf.freeSpace() - 2 size = len(self.outputPageHashR[rk]) while size > 0: self.accessPageBlock(buf, riter) size -= M for (lPageId, lhsPage) in iter(self.outputPageHashL[lk]): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema( self.lhsSchema, lTuple) for pid in self.pidsInBlock: rhsPage = buf.getPage(pid, pinned=True) for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema( self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(evalStr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate( *[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. 
if self.outputPages: self.outputPages = [self.outputPages[-1]] self.accessPageBlock(buf, riter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) def getRelId(self, hashVal, isLeft): tempstr = 'temp' if isLeft: tempstr = 'templ' return self.relationId() + tempstr + str(hashVal) def initializeOutputHash(self, hashVal, isLeft): relId = self.getRelId(hashVal, isLeft) if self.storage.hasRelation(relId): self.storage.removeRelation(relId) if isLeft: self.storage.createRelation(relId, self.lhsSchema) self.tempFileHashL[hashVal] = self.storage.fileMgr.relationFile( relId)[1] self.outputPageHashL[hashVal] = [] else: self.storage.createRelation(relId, self.rhsSchema) self.tempFileHashR[hashVal] = self.storage.fileMgr.relationFile( relId)[1] self.outputPageHashR[hashVal] = [] def emitOutputTupleHash(self, tupleData, hashVal, isLeft): if isLeft: if hashVal not in self.tempFileHashL.keys(): self.initializeOutputHash(hashVal, isLeft) else: if hashVal not in self.tempFileHashR.keys(): self.initializeOutputHash(hashVal, isLeft) self.currFile = self.tempFileHashR[hashVal] self.currOutputPages = self.outputPageHashR[hashVal] if isLeft: self.currFile = self.tempFileHashL[hashVal] self.currOutputPages = self.outputPageHashL[hashVal] allocatePage = not ( self.currOutputPages and (self.currOutputPages)[-1][1].header.hasFreeTuple()) if allocatePage: # Flush the most recently updated output page, which updates the storage file's # free page list to ensure correct new page allocation. if self.currOutputPages: self.storage.bufferPool.flushPage( (self.currOutputPages)[-1][0]) outputPageId = self.currFile.availablePage() outputPage = self.storage.bufferPool.getPage(outputPageId) self.currOutputPages.append((outputPageId, outputPage)) else: outputPage = (self.currOutputPages)[-1][1] outputPage.insertTuple(tupleData) if self.sampled: self.estimatedCardinality += 1 else: self.actualCardinality += 1 def printerr(self, string): f = open('err.txt', 'a') f.write(str(string) + '\n') f.close() # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString(), "rhsKeySchema=" + self.rhsKeySchema.toString(), "lhsHashFn='" + self.lhsHashFn + "'", "rhsHashFn='" + self.rhsHashFn + "'" ]))) + ")" return super().explain() + exprs
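# The hash join above rebuilds an equality predicate over the key schemas and
# appends it to any user-supplied joinExpr before re-checking matches within a
# bucket. One way to assemble such a predicate string (field names illustrative):

lhsKeyFields = ["l_custkey"]
rhsKeyFields = ["c_custkey"]

predicate = " and ".join(
    "{} == {}".format(l, r) for l, r in zip(lhsKeyFields, rhsKeyFields))
print(predicate)  # l_custkey == c_custkey

env = {"l_custkey": 42, "c_custkey": 42}
print(eval(predicate, {}, env))  # True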
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete group-by specification, missing a required parameter") if not self.aggExprs: raise ValueError("Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. 
def __iter__(self): self.initializeOutput() self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): relIdMap = {} # Perform partition using hash function self.partition(relIdMap) # Perform group-by operation for hashValue, relId in relIdMap.items(): pageIter = self.storage.pages(relId) aggregationResults = {} # Stores intermediate aggregation results for _, page in pageIter: for tupleP in page: tupleU = self.subSchema.unpack(tupleP) gbVal = self.getGroupByValue(tupleU) # Get intermediate results for this group-by value intermediateResults = aggregationResults.get(gbVal, None) if intermediateResults is None: intermediateResults = list() aggregationResults[gbVal] = intermediateResults for aggExpr in self.aggExprs: # Form a list of initial values intermediateResults.append(aggExpr[0]) idx = 0 for aggExpr in self.aggExprs: # Perform aggregation by applying the lambda function (aggExpr[1]) intermediateResult = intermediateResults[idx] intermediateResults[idx] = aggExpr[1](intermediateResult, tupleU) idx = idx + 1 for gbVal, intermediateResults in aggregationResults.items(): idx = 0 for aggExpr in self.aggExprs: # Perform final step by applying the lambda function (aggExpr[2]) intermediateResult = intermediateResults[idx] intermediateResults[idx] = aggExpr[2](intermediateResult) idx = idx + 1 outputList = itertools.chain([gbVal[0]], intermediateResults) outputTuple = self.outputSchema.instantiate(*outputList) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Remove partitions for _, relId in relIdMap.items(): self.storage.removeRelation(relId) return self.storage.pages(self.relationId()) # Partitions a given relation based on some hash function def partition(self, relIdMap): for (pageId, page) in iter(self.subPlan): for tupleP in page: # Compute hash value for every tuple tupleU = self.subSchema.unpack(tupleP) hashVal = self.groupHashFn(self.getGroupByValue(tupleU)) # Store in temporary buckets (files) if not hashVal in relIdMap: relId = str(self.id()) + "_grp_" + str(hashVal) self.storage.createRelation(relId, self.subSchema) relIdMap[hashVal] = relId self.storage.insertTuple(relIdMap[hashVal], tupleP) def getGroupByValue(self, unpackedTuple): gbVal = self.groupExpr(unpackedTuple) return gbVal if type(gbVal) is tuple else gbVal, # Plan and statistics information # Returns a single line description of the operator. def explain(self): return super().explain() + "(groupSchema=" + self.groupSchema.toString() \ + ", aggSchema=" + self.aggSchema.toString() + ")"
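# getGroupByValue above normalizes the grouping expression's result to a tuple so
# single-attribute and multi-attribute group-by keys are handled uniformly; the
# trailing comma is what builds the 1-tuple. A tiny standalone illustration:

def asTuple(v):
    return v if type(v) is tuple else (v,)

print(asTuple(5))       # (5,)
print(asTuple((1, 2)))  # (1, 2)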
bp.setFileManager(fm)
fm.createRelation(schema.name, schema)
(fId, f) = fm.relationFile(schema.name)
print(f.numPages() == 0)

pId  = PageId(fId, 0)
pId1 = PageId(fId, 1)
p    = Page(pageId=pId,  buffer=bytes(f.pageSize()), schema=schema)
p1   = Page(pageId=pId1, buffer=bytes(f.pageSize()), schema=schema)

# Fill the two pages with packed tuples.
for tup in [schema.pack(schema.instantiate(i, 2*i+20)) for i in range(10)]:
  _ = p.insertTuple(tup)

for tup in [schema.pack(schema.instantiate(i, i+20)) for i in range(10, 20)]:
  _ = p1.insertTuple(tup)

f.writePage(p)
f.writePage(p1)
print(p.header.usedSpace())

# Read back the first page's header and check the file's page count.
h1 = f.readPageHeader(pId)
print(h1)
print(h1.tupleSize)
print(h1.freeSpaceOffset)
print(h1.pageCapacity)
print(h1.usedSpace())
print(f.numPages() == 2)
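# A quick way to sanity-check the inserts above is to walk a page's packed tuples
# and unpack them with the schema. This assumes, as elsewhere in this code, that
# iterating a Page yields its packed tuples:

for tup in p:
  print(schema.unpack(tup))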
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() def localCost(self, estimated): tupleSize = self.subPlan.schema().size numTuples = self.subPlan.cardinality(estimated) pageSize = self.storage.bufferPool.pageSize numPages = (tupleSize * numTuples) // pageSize return 2 * numTuples * self.tupleCost #return 2 * numPages #derived from: http://www4.comp.polyu.edu.hk/~csmlyiu/conf/CIKM09_skygroup.pdf with the assumption that G=1 and therefore the log value will be close to 1 # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete group-by specification, missing a required parameter") if not self.aggExprs: raise ValueError("Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. def __iter__(self): self.initializeOutput() self.partitionFiles = {} self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Processing helpers def ensureTuple(self, x): if not isinstance(x, tuple): return (x,) else: return x def initialExprs(self): return [i[0] for i in self.aggExprs] def incrExprs(self): return [i[1] for i in self.aggExprs] def finalizeExprs(self): return [i[2] for i in self.aggExprs] # Set-at-a-time operator processing def processAllPages(self): # Create partitions of the input records by hashing the group-by values for (pageId, page) in self.subPlan: for tup in page: groupVal = self.ensureTuple(self.groupExpr(self.subSchema.unpack(tup))) groupId = self.groupHashFn(groupVal) self.emitPartitionTuple(groupId, tup) # We assume that the partitions fit in main memory. for partRelId in self.partitionFiles.values(): partFile = self.storage.fileMgr.relationFile(partRelId)[1] # Use an in-memory Python dict to accumulate the aggregates. aggregates = {} for (pageId, page) in partFile.pages(): for tup in page: # Evaluate group-by value. 
          namedTup = self.subSchema.unpack(tup)
          groupVal = self.ensureTuple(self.groupExpr(namedTup))

          # Look up the aggregate for the group.
          if groupVal not in aggregates:
            aggregates[groupVal] = self.initialExprs()

          # Increment the aggregate.
          aggregates[groupVal] = \
            list(map( \
              lambda x: x[0](x[1], namedTup), \
              zip(self.incrExprs(), aggregates[groupVal])))

      # Finalize the aggregate value for each group.
      for (groupVal, aggVals) in aggregates.items():
        finalVals = list(map(lambda x: x[0](x[1]), zip(self.finalizeExprs(), aggVals)))
        outputTuple = self.outputSchema.instantiate(*(list(groupVal) + finalVals))
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Clean up partitions.
    self.removePartitionFiles()

    # Return an iterator for the output file.
    return self.storage.pages(self.relationId())

  # Bucket construction helpers.

  def partitionRelationId(self, partitionId):
    return self.operatorType() + str(self.id()) + "_" \
            + "part_" + str(partitionId)

  def emitPartitionTuple(self, partitionId, partitionTuple):
    partRelId = self.partitionRelationId(partitionId)

    # Create a partition file as needed.
    if not self.storage.hasRelation(partRelId):
      self.storage.createRelation(partRelId, self.subSchema)
      self.partitionFiles[partitionId] = partRelId

    partFile = self.storage.fileMgr.relationFile(partRelId)[1]
    if partFile:
      partFile.insertTuple(partitionTuple)

  # Delete all existing partition files.
  def removePartitionFiles(self):
    for partRelId in self.partitionFiles.values():
      self.storage.removeRelation(partRelId)
    self.partitionFiles = {}

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
            + ", aggSchema=" + self.aggSchema.toString() + ")"
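# Under the (initial, increment, finalize) convention used by this operator, an
# average can be computed by carrying a (sum, count) pair and dividing in the
# finalize step. An illustrative triple, not taken from this file:

avgExpr = ((0.0, 0),
           lambda acc, t: (acc[0] + t[1], acc[1] + 1),
           lambda acc: acc[0] / acc[1] if acc[1] else None)

init, incr, final = avgExpr
acc = init
for t in [(1, 10.0), (1, 20.0)]:
    acc = incr(acc, t)
print(final(acc))  # 15.0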
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError( "Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() self.tempFileHash = dict() self.outputPageHash = dict() self.tempFile = None #self.outputSchema = self.aggSchema # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete group-by specification, missing a required parameter" ) if not self.aggExprs: raise ValueError( "Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. def __iter__(self): self.iterator = iter(self.subPlan) self.acc = dict() self.outputIterator = self.processAllPages() return self def __next__(self): if self.pipelined: return self.outputPage() else: return next(self.outputIterator) raise StopIteration ''' (PageId, Page) = next(self.iterator) #self.printerr('memes') #self.printerr(self.schema().schema()) #for k in self.acc.keys(): # self.emitOutputTuple(self.aggExprs[2](self.acc[k])) #raise StopIteration for Tuple in Page: # Load the lhs once per inner loop. val = self.loadSchema(self.subSchema, Tuple) temp = namedtuple('temp',val.keys()) l = list() for k in val.keys(): l.append(val[k]) ntup = temp._make(l) expr = self.groupExpr(ntup) #self.printerr(expr) hash = self.groupHashFn((expr,0)) #self.printerr(hash) #if hash not in self.acc.keys(): # self.acc[hash] = self.aggExprs[0] #self.acc[hash] = self.aggExprs[1](self.acc[hash],val) #ids = [] #for tup in self.groupSchema.schema(): # ids.append(tup[0]) #temp = namedtuple('temp',ids) #l = list() #for k in ids: # l.append(val[k]) #ntup = temp._make(l) #self.printerr(hash) self.emitOutputTupleHash(ntup, hash)''' # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): self.initializeSchema() self.acc = dict() for (PageId, Page) in iter(self.subPlan): for Tuple in Page: # Load the lhs once per inner loop. 
val = self.loadSchema(self.subSchema, Tuple) ntup = self.subSchema.instantiate( *[val[f] for f in self.subSchema.fields]) expr = self.groupExpr(ntup) #self.printerr(expr) hash = self.groupHashFn((expr, 0)) self.emitOutputTupleHash(Tuple, hash) for k in self.outputPageHash.keys(): acc = dict() for i, outSchema in enumerate(self.aggSchema.schema()): acc[outSchema[0]] = self.aggExprs[i][0] for pinfo in self.outputPageHash[k]: page = self.storage.bufferPool.getPage(pinfo[0]) for tup in page: val = self.loadSchema(self.subSchema, tup) temp = namedtuple('temp', val.keys()) l = list() for k in val.keys(): l.append(val[k]) ntup = temp._make(l) for i, outSchema in enumerate(self.groupSchema.schema()): acc[outSchema[0]] = self.groupExpr(ntup) for i, outSchema in enumerate(self.aggSchema.schema()): acc[outSchema[0]] = self.aggExprs[i][2]( self.aggExprs[i][1](acc[outSchema[0]], ntup)) outputTuple = self.outputSchema.instantiate( *[acc[f] for f in self.outputSchema.fields]) #self.printerr(outputTuple) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) #if self.outputPages: #self.outputPages = [self.outputPages[-1]] return self.storage.pages(self.relationId()) def getRelId(self, hashVal): return self.relationId() + 'temp' + str(hashVal) def initializeOutputHash(self, hashVal): relId = self.getRelId(hashVal) if self.storage.hasRelation(relId): self.storage.removeRelation(relId) self.storage.createRelation(relId, self.subSchema) self.tempFileHash[hashVal] = self.storage.fileMgr.relationFile( relId)[1] self.outputPageHash[hashVal] = [] def emitOutputTupleHash(self, tupleData, hashVal): if hashVal not in self.tempFileHash.keys(): self.initializeOutputHash(hashVal) self.currFile = self.tempFileHash[hashVal] self.currOutputPages = self.outputPageHash[hashVal] allocatePage = not ( self.outputPageHash[hashVal] and (self.outputPageHash[hashVal])[-1][1].header.hasFreeTuple()) if allocatePage: # Flush the most recently updated output page, which updates the storage file's # free page list to ensure correct new page allocation. if self.outputPageHash[hashVal]: self.storage.bufferPool.flushPage( (self.outputPageHash[hashVal])[-1][0]) outputPageId = self.currFile.availablePage() outputPage = self.storage.bufferPool.getPage(outputPageId) self.outputPageHash[hashVal].append((outputPageId, outputPage)) else: outputPage = (self.outputPageHash[hashVal])[-1][1] outputPage.insertTuple(tupleData) #if self.sampled: # self.estimatedCardinality += 1 #else: # self.actualCardinality += 1 def printerr(self, string): f = open('err.txt', 'a') f.write(str(string) + '\n') f.close() # Plan and statistics information # Returns a single line description of the operator. def explain(self): return super().explain() + "(groupSchema=" + self.groupSchema.toString() \ + ", aggSchema=" + self.aggSchema.toString() + ")"
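# Illustrative sketch (an assumption, not taken from the original code):
# groupHashFn only needs to map a group-key tuple to a small bucket identifier,
# since its result is embedded in the temporary relation name by getRelId above.
# A simple modulo hash over a hypothetical bucket count would satisfy that call.
exampleNumBuckets = 4  # hypothetical partition count
exampleGroupHashFn = lambda groupKey: hash(groupKey) % exampleNumBuckets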
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None \ or self.storage.getIndex(self.indexId) is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): self.initializeOutput(); return self.processAllPages(); def __next__(self): return next(self.storage.pages(self.relationId())); # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. def blockNestedLoops(self): bufPool = self.storage.bufferPool; lSchema = self.inputSchemas()[0]; rSchema = self.inputSchemas()[1]; lhsKey = self.joinExpr.split('==')[0].strip(); rhsKey = self.joinExpr.split('==')[1].strip(); self.cleanBufferPool(bufPool); self.logger("starting...") for pageBlock in self.accessPageBlock(bufPool, iter(self.lhsPlan)): self.logger("one new pageBlock..."); hasher = dict(); for lPageId in pageBlock: lhsPage = bufPool.getPage(lPageId); for lTuple in iter(lhsPage): tupleObj = lSchema.unpack(lTuple); key = getattr(tupleObj, lhsKey); if key in hasher: hasher[key].append(lTuple); else: hasher[key] = [lTuple]; for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in iter(rhsPage): tupleObj = rSchema.unpack(rTuple); key = getattr(tupleObj, rhsKey); if key in hasher: joinExprEnv = self.loadSchema(rSchema, rTuple); for lTuple in hasher[key]: joinExprEnv.update(self.loadSchema(lSchema, lTuple)); outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]); outputTupleP = self.joinSchema.pack(outputTuple); self.storage.fileMgr.relationFile(self.relationId())[1].insertTuple(outputTupleP); for lPageId in pageBlock: self.storage.bufferPool.unpinPage(lPageId); self.storage.bufferPool.discardPage(lPageId); self.cleanBufferPool(bufPool); del hasher; self.logger("ending..."); return self.storage.pages(self.relationId()); # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. 
def accessPageBlock(self, bufPool, pageIterator): self.cleanBufferPool( bufPool ); pageBlock = []; self.inputFinished = False; while not(self.inputFinished): try: (pageId, page) = next(pageIterator); if (bufPool.numFreePages() > 2): _ = bufPool.getPage(pageId); bufPool.pinPage(pageId); pageBlock.append(pageId); else: yield pageBlock; pageBlock = []; except StopIteration: self.inputFinished = True; yield pageBlock; ################################## # # Indexed nested loops implementation # def indexedNestedLoops(self): raise NotImplementedError ################################## # # Some helper function # # clean buffer pool before use def cleanBufferPool(self, bufPool): # evict out clean pages and flush dirty pages for (pageId, (_, page, pinCount)) in bufPool.pageMap.items(): if not(pinCount == 0): raise RuntimeError("Unable to clean bufferpool. Memory leaks?"); else: if (page.isDirty()): # evict with flush bufPool.flushPage( pageId ); # evict without flush bufPool.discardPage( pageId ); ################################## # # Hash join implementation. # def hashJoin(self): if self.joinExpr == None: self.joinExpr = self.lhsKeySchema.fields[0] + "==" + self.rhsKeySchema.fields[0]; self.tmpFilesL = list(); self.tmpFilesR = list(); bufPool = self.storage.bufferPool; self.logger("start..."); self.cleanBufferPool(bufPool); tmpFilesL = dict(); tmpFilesR = dict(); self.logger("building L partition"); for (PageId, Page) in iter(self.lhsPlan): self.buildPartitionL(PageId, Page, tmpFilesL); self.logger("building R partition"); for (PageId, Page) in iter(self.rhsPlan): self.buildPartitionR(PageId, Page, tmpFilesR); # Schema prep lSchema = self.inputSchemas()[0]; rSchema = self.inputSchemas()[1]; for relIdLKey in tmpFilesL.keys(): # Clean up before running. if relIdLKey in tmpFilesR: (_, relIdTmpR) = tmpFilesR[ relIdLKey ]; (_, relIdTmpL) = tmpFilesL[ relIdLKey ]; else: continue; self.cleanBufferPool( bufPool ); lhsPlan = TableScan(relIdTmpL, self.inputSchemas()[0]); rhsPlan = TableScan(relIdTmpR, self.inputSchemas()[1]); lhsPlan.storage = self.storage; rhsPlan.storage = self.storage; self.lhsPlan = lhsPlan; self.rhsPlan = rhsPlan; for lPageId in pageBlock: lhsPage = bufPool.getPage(lPageId); for ltuple in iter( lhsPage ): tupleObj = lSchema.unpack( ltuple ); key = lSchema.project( tupleObj, self.lhsKeySchema )[0]; if key in hasher: hasher[ key ].append( ltuple ); else: hasher[ key ] = [ ltuple ]; # iterating all rtuples to pack output for (rPageId, rhsPage) in iter(rhsPlan): print( rPageId.pageIndex ); for rTuple in iter( rhsPage ): tupleObj = rSchema.unpack( rTuple ); print( tupleObj ); key = rSchema.project( tupleObj, self.rhsKeySchema )[0]; if key in hasher: for lTuple in hasher[ key ]: joinIns = self.loadSchema( lSchema, lTuple ) joinIns.update( self.loadSchema( rSchema, rTuple ) ); outputTuple = self.joinSchema.instantiate(*[joinIns[f] for f in self.joinSchema.fields]); print( outputTuple ); outputTupleP = self.joinSchema.pack(outputTuple); self.storage.fileMgr.relationFile(self.relationId())[1].insertTuple(outputTupleP); for lPageId in pageBlock: bufPool.unpinPage(lPageId); bufPool.discardPage(lPageId); self.cleanBufferPool(bufPool); del hasher; _ = self.blockNestedLoops(); self.storage.removeRelation(relIdTmpL); self.storage.removeRelation(relIdTmpR);
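# Illustrative sketch (assumptions noted): this hash-join variant derives its
# equality predicate from the first field of each key schema, which suggests
# single-field key schemas are the expected configuration. The field names
# below come from the employee/project relations used by the test script that
# follows; the schema names 'lhsKey'/'rhsKey' are arbitrary.
exampleLhsKeySchema = DBSchema('lhsKey', [('e_projectid', 'int')])
exampleRhsKeySchema = DBSchema('rhsKey', [('p_id', 'int')])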
db = Database.Database()
deptSchema = DBSchema('department', [('d_id', 'int'), ('d_name', 'char(30)')])
emplSchema = DBSchema('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')])
projSchema = DBSchema('project', [('p_id', 'int'), ('p_name', 'char(30)')])
gratSchema = DBSchema('grant', [('g_id', 'int'), ('g_projectid', 'int'), ('g_source', 'char(30)')])
synSchema1 = DBSchema('syn1', [('a', 'int'), ('b', 'char(30)')])
synSchema2 = DBSchema('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')])

db.createRelation('department', [('d_id', 'int'), ('d_name', 'char(30)')])
db.createRelation('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')])
db.createRelation('project', [('p_id', 'int'), ('p_name', 'char(30)')])
db.createRelation('grant', [('g_id', 'int'), ('g_projectid', 'int'), ('g_source', 'char(30)')])
db.createRelation('syn1', [('a', 'int'), ('b', 'char(30)')])
db.createRelation('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')])

for tup in [deptSchema.pack(deptSchema.instantiate(i, "Nature" + str(i))) for i in range(4000)]:
    _ = db.insertTuple('department', tup)

for tup in [deptSchema.pack(deptSchema.instantiate(i, "Science" + str(i))) for i in range(4000, 8000)]:
    _ = db.insertTuple('department', tup)

ename = ["John", "Mike", "Davis", "Alex"]
for tup in [emplSchema.pack(emplSchema.instantiate(i, ename[i % 4], i % 10)) for i in range(8000)]:
    _ = db.insertTuple('employee', tup)

projectName = ["CS", "EE", "Biophysics", "Biostats", "NeuroScience", "Cell Biology"]
for tup in [projSchema.pack(projSchema.instantiate(i, projectName[i % 6])) for i in range(8000)]:
    _ = db.insertTuple('project', tup)

sourceName = ["NIH", "NSF", "Apple", "Microsoft", "Google"]
for tup in [gratSchema.pack(gratSchema.instantiate(i, i % 2000, sourceName[i % 5])) for i in range(8000)]:
    _ = db.insertTuple('grant', tup)

for tup in [synSchema1.pack(synSchema1.instantiate(i, sourceName[i % 3])) for i in range(8000)]:
    _ = db.insertTuple('syn1', tup)

for tup in [synSchema2.pack(synSchema2.instantiate(i, sourceName[i % 5], i % 500)) for i in range(8000)]:
    _ = db.insertTuple('syn2', tup)
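# Illustrative, runnable sketch of how the join operators above evaluate an
# 'expr' string: loadSchema produces a dict of field-name bindings for a tuple,
# and the predicate is eval'd against the merged lhs/rhs bindings. The literal
# values below are made up; the field names match the employee/project schemas.
exampleBindings = {'e_id': 7, 'e_name': 'Mike', 'e_projectid': 3,
                   'p_id': 3, 'p_name': 'Biostats'}
exampleMatch = eval('e_projectid == p_id', globals(), exampleBindings)  # True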
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): raise NotImplementedError def __next__(self): raise NotImplementedError # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): raise NotImplementedError def blockNestedLoops(self): raise NotImplementedError ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): raise NotImplementedError ################################## # # Hash join implementation. # def hashJoin(self): raise NotImplementedError # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "indexKeySchema=" + self.lhsKeySchema.toString() ] ))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString() , "rhsKeySchema=" + self.rhsKeySchema.toString() , "lhsHashFn='" + self.lhsHashFn + "'" , "rhsHashFn='" + self.rhsHashFn + "'" ] ))) + ")" return super().explain() + exprs
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get( "lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get( "rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in [ "nested-loops", "block-nested-loops", "indexed", "hash" ]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete join specification, missing join operator parameter" ) # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError( "Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None \ or self.storage.getIndex(self.indexId) is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops': 'NL', 'block-nested-loops': 'BNL', 'indexed': 'Index', 'hash': 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): relId = self.relationId() if self.storage.hasRelation(relId): return self.storage.pages(relId) self.initializeOutput() self.partitionFiles = {0: {}, 1: {}} self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): pageBlock = [] try: while True: (pageId, page) = next(pageIterator) pageBlock.append((pageId, page)) bufPool.pinPage(pageId) if bufPool.numFreePages() == 0: break except StopIteration: pass return pageBlock def blockNestedLoops(self): # Access the outer relation's block, pinning pages in the buffer pool. bufPool = self.storage.bufferPool lhsIter = iter(self.lhsPlan) lPageBlock = self.accessPageBlock(bufPool, lhsIter) while lPageBlock: for (lPageId, lhsPage) in lPageBlock: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Unpin the page after joining with the RHS relation. # Thus future accesses can evict the page while reading the next block. bufPool.unpinPage(lPageId) # Move to the next page block after processing it. 
lPageBlock = self.accessPageBlock(bufPool, lhsIter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): if self.indexId: bufPool = self.storage.bufPool for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) # Match against RHS tuples using the index. joinKey = self.lhsSchema.projectBinary( lTuple, self.lhsKeySchema) matches = self.storage.lookupByIndex(self.indexId, joinKey) for rhsTupId in matches: rhsPage = bufPool.getPage(rhsTupId.pageId) rTuple = rhsPage.getTuple(rhsTupId) # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate any remaining join predicate, and output if we have a match. fullMatch = eval( self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True if fullMatch: outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) else: raise ValueError( "No index found while using an indexed nested loops join") ################################## # # Hash join implementation. # def hashJoin(self): # Partition the LHS and RHS inputs, creating a temporary file for each partition. # We assume one-level of partitioning is sufficient and skip recurring. for (lPageId, lPage) in iter(self.lhsPlan): for lTuple in lPage: lPartEnv = self.loadSchema(self.lhsSchema, lTuple) lPartKey = eval(self.lhsHashFn, globals(), lPartEnv) self.emitPartitionTuple(lPartKey, lTuple, left=True) for (rPageId, rPage) in iter(self.rhsPlan): for rTuple in rPage: rPartEnv = self.loadSchema(self.rhsSchema, rTuple) rPartKey = eval(self.rhsHashFn, globals(), rPartEnv) self.emitPartitionTuple(rPartKey, rTuple, left=False) # Iterate over partition pairs and output matches # evaluating the join expression as necessary. for ((lPageId, lPage), (rPageId, rPage)) in self.partitionPairs(): for lTuple in lPage: joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for rTuple in rPage: joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) output = \ ( self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) \ == self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) ) \ and ( eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True ) if output: outputTuple = self.joinSchema.instantiate( *[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Clean up partitions. self.removePartitionFiles() # Return an iterator to the output relation return self.storage.pages(self.relationId()) # Hash join helpers. def partitionRelationId(self, left, partitionId): return self.operatorType() + str(self.id()) + "_" \ + ("l" if left else "r") + "part_" + str(partitionId) + str(self.opMarker) def emitPartitionTuple(self, partitionId, partitionTuple, left=False): partRelId = self.partitionRelationId(left, partitionId) partSchema = self.lhsSchema if left else self.rhsSchema # Create a partition file as needed. 
if not self.storage.hasRelation(partRelId): self.storage.createRelation(partRelId, partSchema) self.partitionFiles[int(left)][partitionId] = partRelId partFile = self.storage.fileMgr.relationFile(partRelId)[1] if partFile: partFile.insertTuple(partitionTuple) # Return pairs of pages from matching partitions. def partitionPairs(self): lKeys = self.partitionFiles[1].keys() rKeys = self.partitionFiles[0].keys() matches = [(self.partitionFiles[1][partId], self.partitionFiles[0][partId]) \ for partId in lKeys if partId in rKeys] return PartitionIterator(matches, self.storage) # Delete all existing partition files. def removePartitionFiles(self): for lPartRelId in self.partitionFiles[0].values(): self.storage.removeRelation(lPartRelId) for rPartRelId in self.partitionFiles[1].values(): self.storage.removeRelation(rPartRelId) self.partitionFiles = {0: {}, 1: {}} # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString(), "rhsKeySchema=" + self.rhsKeySchema.toString(), "lhsHashFn='" + self.lhsHashFn + "'", "rhsHashFn='" + self.rhsHashFn + "'" ]))) + ")" return super().explain() + exprs # We override the cost model here. # This cost model cannot be compatible with the general # operators' costs so that it is not used during Join # Optimization. def localCost(self, estimated): if estimated: l_inputPages = 0 r_inputPages = 0 fileSize = 0 pageBlockNum = 0 try: _, l_inputPages, _ = self.storage.relationStats( self.lhsPlan.relationId()) _, r_inputPages, _ = self.storage.relationStats( self.rhsPlan.relationId()) pageBlockNum = math.ceil(l_inputPages / self.storage.bufferPool.numPages()) except: pass l_inputPages *= self.sampleFactor r_inputPages *= self.sampleFactor if (self.joinMethod == "nested-loops"): local_cost = l_inputPages + self.lhsPlan.cardinality * r_inputPages elif (self.joinMethod == "block-nested-loops"): local_cost = l_inputPages + pageBlockNum * r_inputPages # We don't support indexed # elif (self.joinMethod == "indexed"): # index_pages = self.storage.fileMgr.getIndex(self.indexID).numPages(); Not verified with BDB index file # rmatch_pages = ? # local_cost = l_inputPages + self.lhsPlan.cardinality * (index_pages + rmatch_pages); elif (self.joinMethod == "hash"): local_cost = 3 * (l_inputPages + r_inputPages) return local_cost
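# Illustrative arithmetic only (the page counts are made up, and the sampling
# factor is ignored): with 100 LHS pages, 400 RHS pages and a 50-frame buffer
# pool, the estimated costs computed by localCost above would come out as:
import math
exampleLhsPages, exampleRhsPages, exampleBufferFrames = 100, 400, 50
examplePageBlocks = math.ceil(exampleLhsPages / exampleBufferFrames)   # 2 outer-relation blocks
exampleBnlCost = exampleLhsPages + examplePageBlocks * exampleRhsPages  # 100 + 2 * 400 = 900
exampleHashCost = 3 * (exampleLhsPages + exampleRhsPages)               # 3 * (100 + 400) = 1500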
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get( "lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get( "rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in [ "nested-loops", "block-nested-loops", "indexed", "hash" ]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete join specification, missing join operator parameter" ) # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError( "Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops': 'NL', 'block-nested-loops': 'BNL', 'indexed': 'Index', 'hash': 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): self.initializeOutput() # Pipelined join operator is not supported self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. 
def cleanBufferPool(self, bufPool): items = list(bufPool.pageMap.items()) for (pageId, (offset, page, pinned)) in items: if pinned > 0: continue elif page.isDirty(): bufPool.flushPage(pageId) else: bufPool.discardPage(pageId) def accessPageBlock(self, bufPool, pageIterator): block_pageList = [] self.cleanBufferPool(bufPool) inputNotFinished = True try: while inputNotFinished: (pageId, page) = next(pageIterator) bufPool.getPage(pageId) block_pageList.append(pageId) if bufPool.numFreePages() > 2: inputNotFinished = False except StopIteration: pass return block_pageList def blockNestedLoops(self): pageIterator = iter(self.lhsPlan) bufPool = self.storage.bufferPool block_pageList = self.accessPageBlock(bufPool, pageIterator) while block_pageList: for pageId in block_pageList: lhsPage = bufPool.getPage(pageId) for lTuple in lhsPage: joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] bufPool.unpinPage(pageId) block_pageList = self.accessPageBlock(bufPool, pageIterator) return self.storage.pages(self.relationId()) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): raise NotImplementedError ################################## # # Hash join implementation. # def hashJoin(self): tmpFileL = dict() tmpFileR = dict() bufPool = self.storage.bufferPool self.partition(self.lhsPlan, self.lhsSchema, self.lhsHashFn, tmpFileL, "lhs") self.partition(self.rhsPlan, self.rhsSchema, self.rhsHashFn, tmpFileR, "rhs") if not self.joinExpr: left_id = self.lhsHashFn.split('(')[1].split(')')[0].strip() right_id = self.rhsHashFn.split('(')[1].split(')')[0].strip() self.joinExpr = left_id + ' == ' + right_id for key in tmpFileL.keys(): if key in tmpFileR: relTmpL = tmpFileL[key] relTmpR = tmpFileR[key] pageIterator_lhs = self.storage.pages(relTmpL) for (lPageId, lhsPage) in pageIterator_lhs: for lTuple in lhsPage: joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) pageIterator_rhs = self.storage.pages(relTmpR) for (rPageId, rhsPage) in pageIterator_rhs: for rTuple in rhsPage: joinExprEnv.update( self.loadSchema(self.rhsSchema, rTuple)) if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate( *[ joinExprEnv[f] for f in self.joinSchema.fields ]) self.emitOutputTuple( self.joinSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] else: continue self.cleanBufferPool(bufPool) for relTmp in tmpFileL.items(): self.storage.removeRelation(relTmp) for relTmp in tmpFileR.items(): self.storage.removeRelation(relTmp) return self.storage.pages(self.relationId()) def partition(self, plan, schema, hashFn, tmpFile, relPrefix): for (pageId, page) in iter(plan): for Tuple in page: fieldBindings = self.loadSchema(schema, Tuple) hashValue = eval(hashFn, globals(), fieldBindings) # Store in temporary buckets (files) if not hashValue in tmpFile: relId = str( self.id()) + "_" + relPrefix + "_" + str(hashValue) self.storage.createRelation(relId, schema) tmpFile[hashValue] = relId self.storage.insertTuple(tmpFile[hashValue], Tuple) # Plan and statistics information # Returns a single line description of the operator. 
def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString(), "rhsKeySchema=" + self.rhsKeySchema.toString(), "lhsHashFn='" + self.lhsHashFn + "'", "rhsHashFn='" + self.rhsHashFn + "'" ]))) + ")" return super().explain() + exprs
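# Illustrative, runnable sketch of the hash-function strings used by the hash
# join above: lhsHashFn/rhsHashFn are eval'd against a tuple's field bindings in
# partition(), so a string such as "hash(e_projectid) % 4" both names the key
# field (the identifier inside the parentheses, which hashJoin reuses to build
# the join predicate) and selects a partition. The binding values are made up.
exampleLhsHashFn = "hash(e_projectid) % 4"
exampleFieldBindings = {'e_id': 7, 'e_name': 'Mike', 'e_projectid': 3}
examplePartition = eval(exampleLhsHashFn, globals(), exampleFieldBindings)  # one of 0..3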
gratSchema = DBSchema('grant', [('g_id', 'int'), ('g_projectid', 'int'), ('g_source', 'char(30)')])
synSchema1 = DBSchema('syn1', [('a', 'int'), ('b', 'char(30)')])
synSchema2 = DBSchema('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')])

db.createRelation('department', [('d_id', 'int'), ('d_name', 'char(30)')])
db.createRelation('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')])
db.createRelation('project', [('p_id', 'int'), ('p_name', 'char(30)')])
db.createRelation('grant', [('g_id', 'int'), ('g_projectid', 'int'), ('g_source', 'char(30)')])
db.createRelation('syn1', [('a', 'int'), ('b', 'char(30)')])
db.createRelation('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')])

for tup in [deptSchema.pack(deptSchema.instantiate(i, "Nature" + str(i))) for i in range(4000)]:
    _ = db.insertTuple('department', tup)

for tup in [deptSchema.pack(deptSchema.instantiate(i, "Science" + str(i))) for i in range(4000, 8000)]:
    _ = db.insertTuple('department', tup)

ename = ["John", "Mike", "Davis", "Alex"]
for tup in [emplSchema.pack(emplSchema.instantiate(i, ename[i % 4], i % 10)) for i in range(8000)]:
    _ = db.insertTuple('employee', tup)

projectName = ["CS", "EE", "Biophysics", "Biostats", "NeuroScience", "Cell Biology"]
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError( "Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn] if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete group-by specification, missing a required parameter" ) if not self.aggExprs: raise ValueError( "Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. def __iter__(self): self.initializeOutput() self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): relations = [] for (pageId, page) in iter(self.subPlan): for tup in page: unpackedTup = self.subSchema.unpack(tup) groupByVal = tuple([self.groupExpr(unpackedTup)]) hashVal = str(self.groupHashFn(groupByVal)) if hashVal not in relations: self.storage.createRelation(hashVal, self.subSchema) relations.append(hashVal) self.storage.insertTuple(hashVal, tup) for rel in relations: for (pageId, page) in self.storage.pages(rel): groups = {} for tup in page: unpackedTup = self.subSchema.unpack(tup) groupByVal = tuple([self.groupExpr(unpackedTup)]) if groupByVal not in groups.keys(): groups[groupByVal] = [ aggExpr[0] for aggExpr in self.aggExprs ] for i, aggExpr in enumerate(self.aggExprs): groups[groupByVal][i] = aggExpr[1]( groups[groupByVal][i], unpackedTup) for groupByVal in groups.keys(): for i, aggExpr in enumerate(self.aggExprs): groups[groupByVal][i] = aggExpr[2]( groups[groupByVal][i]) outputTuple = self.outputSchema.instantiate( *it.chain(list(groupByVal), groups[groupByVal])) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] for rel in relations: self.storage.removeRelation(rel) return self.storage.pages(self.relationId()) # Plan and statistics information # Returns a single line description of the operator. 
def explain(self): return super().explain() + "(groupSchema=" + self.groupSchema.toString() \ + ", aggSchema=" + self.aggSchema.toString() + ")"
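# Illustrative sketch (the field name e_id is hypothetical): because each
# aggregate in aggExprs is an (initial, increment, finalize) triple, a two-part
# accumulator can carry everything an average needs, dividing only in the
# finalize step.
exampleAvgExpr = (
    (0, 0),                                                # (running sum, running count)
    lambda acc, tup: (acc[0] + tup.e_id, acc[1] + 1),      # fold one tuple into the accumulator
    lambda acc: acc[0] / acc[1] if acc[1] else 0           # finalize to the mean
)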
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError( "Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete group-by specification, missing a required parameter" ) if not self.aggExprs: raise ValueError( "Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. 
def __iter__(self): self.initializeOutput() # Pipelined join operator is not supported self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): Map = dict() # partition the schema into several files in different attributes self.partition(Map) for key, title in Map.items(): # Generate a pageIterator in the file pageIterator = self.storage.pages(title) # Generate an dictionary on intermediate aggregation results aggregator = {} # Get the tuple in the page for _, page in pageIterator: for Tuple in page: tuple_Unpacked = self.subSchema.unpack(Tuple) key = self.groupExpr(tuple_Unpacked) if type(key) is tuple: key = key else: key = key, val = self.groupHashFn(key) intermediate_results = aggregator.get(val, None) # if the intermediate_result has not generated, form one if not intermediate_results: intermediate_results = list() aggregator[val] = intermediate_results for aggExpr in self.aggExprs: intermediate_results.append(aggExpr[0]) index = 0 # Perform the aggregation function for aggExpr in self.aggExprs: intermediate_result = intermediate_results[index] intermediate_results[index] = aggExpr[1]( intermediate_result, tuple_Unpacked) index += 1 for val, intermediate_results in aggregator.items(): index = 0 for aggExpr in self.aggExprs: intermediate_result = intermediate_results[index] intermediate_results[index] = aggExpr[2]( intermediate_result) index += 1 outputList = itertools.chain([val], intermediate_results) outputTuple = self.outputSchema.instantiate(*outputList) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) if self.outputPages: self.outputPages = [self.outputPages[-1]] # remove the temporary relation created for _, title in Map.items(): self.storage.removeRelation(title) return self.storage.pages(self.relationId()) def partition(self, relMap): for (pageId, page) in iter(self.subPlan): for Tuple in page: tuple_Unpacked = self.subSchema.unpack(Tuple) key = self.groupExpr(tuple_Unpacked) if type(key) is tuple: key = key else: key = key, value = self.groupHashFn(key) # if this key is not in relation map, we should create a temperory file to contain these tuples with this key if not value in relMap: title = str(self.id()) + "_grp_" + str(value) self.storage.createRelation(title, self.subSchema) relMap[value] = title self.storage.insertTuple(relMap[value], Tuple) # Plan and statistics information # Returns a single line description of the operator. def explain(self): return super().explain() + "(groupSchema=" + self.groupSchema.toString() \ + ", aggSchema=" + self.aggSchema.toString() + ")"
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError( "Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete group-by specification, missing a required parameter" ) if not self.aggExprs: raise ValueError( "Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. def __iter__(self): self.initializeOutput() self.partitionFiles = {} self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Processing helpers def ensureTuple(self, x): if not isinstance(x, tuple): return (x, ) else: return x def initialExprs(self): return [i[0] for i in self.aggExprs] def incrExprs(self): return [i[1] for i in self.aggExprs] def finalizeExprs(self): return [i[2] for i in self.aggExprs] # Set-at-a-time operator processing def processAllPages(self): # Create partitions of the input records by hashing the group-by values for (pageId, page) in self.subPlan: for tup in page: groupVal = self.ensureTuple( self.groupExpr(self.subSchema.unpack(tup))) groupId = self.groupHashFn(groupVal) self.emitPartitionTuple(groupId, tup) # We assume that the partitions fit in main memory. for partRelId in self.partitionFiles.values(): partFile = self.storage.fileMgr.relationFile(partRelId)[1] # Use an in-memory Python dict to accumulate the aggregates. aggregates = {} for (pageId, page) in partFile.pages(): for tup in page: # Evaluate group-by value. namedTup = self.subSchema.unpack(tup) groupVal = self.ensureTuple(self.groupExpr(namedTup)) # Look up the aggregate for the group. if groupVal not in aggregates: aggregates[groupVal] = self.initialExprs() # Increment the aggregate. aggregates[groupVal] = \ list(map( \ lambda x: x[0](x[1], namedTup), \ zip(self.incrExprs(), aggregates[groupVal]))) # Finalize the aggregate value for each group. 
      for (groupVal, aggVals) in aggregates.items():
        finalVals = list(map(lambda x: x[0](x[1]), zip(self.finalizeExprs(), aggVals)))
        outputTuple = self.outputSchema.instantiate(*(list(groupVal) + finalVals))
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

    # No need to track anything but the last output page when in batch mode.
    if self.outputPages:
      self.outputPages = [self.outputPages[-1]]

    # Clean up partitions.
    self.removePartitionFiles()

    # Return an iterator for the output file.
    return self.storage.pages(self.relationId())

  # Bucket construction helpers.
  def partitionRelationId(self, partitionId):
    return self.operatorType() + str(self.id()) + "_" \
            + "part_" + str(partitionId)

  def emitPartitionTuple(self, partitionId, partitionTuple):
    partRelId = self.partitionRelationId(partitionId)

    # Create a partition file as needed.
    if not self.storage.hasRelation(partRelId):
      self.storage.createRelation(partRelId, self.subSchema)
      self.partitionFiles[partitionId] = partRelId

    partFile = self.storage.fileMgr.relationFile(partRelId)[1]
    if partFile:
      partFile.insertTuple(partitionTuple)

  # Delete all existing partition files.
  def removePartitionFiles(self):
    for partRelId in self.partitionFiles.values():
      self.storage.removeRelation(partRelId)
    self.partitionFiles = {}

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
            + ", aggSchema=" + self.aggSchema.toString() + ")"

  # Estimates the local cost as two passes over the input pages:
  # one to write the hash partitions and one to read them back while aggregating.
  def localCost(self, estimated):
    t = self.subPlan.cardinality(estimated)
    p = t / (self.storage.bufferPool.pageSize / self.subPlan.schema().size)
    return p + p
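# A worked example of the localCost() estimate above, under assumed numbers
# (100,000 input tuples, 4 KB pages, 64-byte tuples); these figures are
# hypothetical and only illustrate the arithmetic.
numTuples = 100000   # subPlan.cardinality(estimated)
pageSize  = 4096     # storage.bufferPool.pageSize
tupleSize = 64       # subPlan.schema().size

tuplesPerPage = pageSize / tupleSize   # 64.0 tuples per page
p = numTuples / tuplesPerPage          # 1562.5 input pages
cost = p + p                           # 3125.0: write the partitions, then read them back
print(p, cost)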
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) def localCost(self, estimated): tupleSizeLeft = self.lhsPlan.schema().size numTuplesLeft = self.lhsPlan.cardinality(estimated) tupleSizeRight = self.rhsPlan.schema().size numTuplesRight = self.rhsPlan.cardinality(estimated) pageSize = self.storage.bufferPool.pageSize numPagesLeft = (tupleSizeLeft * numTuplesLeft) // pageSize numPagesRight = (tupleSizeRight * numTuplesRight) // pageSize if self.joinMethod == "nested-loops": return (numTuplesLeft * self.tupleCost * numTuplesRight * self.tupleCost) + (numTuplesLeft * self.tupleCost) #return (numTuplesLeft * numPagesRight) + numPagesLeft elif self.joinMethod == "block-nested-loops": return (numTuplesLeft * self.tupleCost) + (((numTuplesLeft * self.tupleCost)// (self.storage.bufferPool.numPages() - 2)) * (numTuplesRight * self.tupleCost)) #return numPagesLeft + ((numPagesLeft // (self.storage.bufferPool.numPages() - 2)) * numPagesRight) elif self.joinMethod == "indexed": raise NotImplementedError elif self.joinMethod == "hash": return 3 * ((numTuplesLeft * self.tupleCost) + (numTuplesRight * self.tupleCost)) else: return None # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. 
def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. def __iter__(self): self.initializeOutput() self.partitionFiles = {0:{}, 1:{}} self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in self.lhsPlan: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in self.rhsPlan: for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): pageBlock = [] try: while True: (pageId, page) = next(pageIterator) pageBlock.append((pageId, page)) bufPool.pinPage(pageId) if bufPool.numFreePages() == 0: break except StopIteration: pass return pageBlock def blockNestedLoops(self): # Access the outer relation's block, pinning pages in the buffer pool. bufPool = self.storage.bufferPool lhsIter = iter(self.lhsPlan) lPageBlock = self.accessPageBlock(bufPool, lhsIter) while lPageBlock: for (lPageId, lhsPage) in lPageBlock: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in self.rhsPlan: for rTuple in rhsPage: # Load the RHS tuple fields. 
joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Unpin the page after joining with the RHS relation. # Thus future accesses can evict the page while reading the next block. bufPool.unpinPage(lPageId) # Move to the next page block after processing it. lPageBlock = self.accessPageBlock(bufPool, lhsIter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): if self.storage.getIndex(self.indexId) is None: raise ValueError("Missing index in storage manager: %s" % self.indexId) if self.indexId: bufPool = self.storage.bufferPool for (lPageId, lhsPage) in self.lhsPlan: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) # Match against RHS tuples using the index. joinKey = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) matches = self.storage.fileMgr.lookupByIndex(self.rhsPlan.relationId(), self.indexId, joinKey) for rhsTupId in matches: rhsPage = bufPool.getPage(rhsTupId.pageId) rTuple = rhsPage.getTuple(rhsTupId) # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate any remaining join predicate, and output if we have a match. fullMatch = eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True if fullMatch: outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) else: raise ValueError("No index found while using an indexed nested loops join") ################################## # # Hash join implementation. # def hashJoin(self): # Partition the LHS and RHS inputs, creating a temporary file for each partition. # We assume one-level of partitioning is sufficient and skip recurring. for (lPageId, lPage) in self.lhsPlan: for lTuple in lPage: lPartEnv = self.loadSchema(self.lhsSchema, lTuple) lPartKey = eval(self.lhsHashFn, globals(), lPartEnv) self.emitPartitionTuple(lPartKey, lTuple, left=True) for (rPageId, rPage) in self.rhsPlan: for rTuple in rPage: rPartEnv = self.loadSchema(self.rhsSchema, rTuple) rPartKey = eval(self.rhsHashFn, globals(), rPartEnv) self.emitPartitionTuple(rPartKey, rTuple, left=False) # Iterate over partition pairs and output matches # evaluating the join expression as necessary. 
for ((lPageId, lPage), (rPageId, rPage)) in self.partitionPairs(): for lTuple in lPage: joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for rTuple in rPage: joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) output = \ ( self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) \ == self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) ) \ and ( eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True ) if output: outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Clean up partitions. self.removePartitionFiles() # Return an iterator to the output relation return self.storage.pages(self.relationId()) # Hash join helpers. def partitionRelationId(self, left, partitionId): return self.operatorType() + str(self.id()) + "_" \ + ("l" if left else "r") + "part_" + str(partitionId) def emitPartitionTuple(self, partitionId, partitionTuple, left=False): partRelId = self.partitionRelationId(left, partitionId) partSchema = self.lhsSchema if left else self.rhsSchema # Create a partition file as needed. if not self.storage.hasRelation(partRelId): self.storage.createRelation(partRelId, partSchema) self.partitionFiles[int(left)][partitionId] = partRelId partFile = self.storage.fileMgr.relationFile(partRelId)[1] if partFile: partFile.insertTuple(partitionTuple) # Return pairs of pages from matching partitions. def partitionPairs(self): lKeys = self.partitionFiles[0].keys() rKeys = self.partitionFiles[1].keys() matches = [(self.partitionFiles[0][partId], self.partitionFiles[1][partId]) \ for partId in lKeys if partId in rKeys] return PartitionIterator(matches, self.storage) # Delete all existing partition files. def removePartitionFiles(self): for lPartRelId in self.partitionFiles[0].values(): self.storage.removeRelation(lPartRelId) for rPartRelId in self.partitionFiles[1].values(): self.storage.removeRelation(rPartRelId) self.partitionFiles = {0:{}, 1:{}} # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "indexKeySchema=" + self.lhsKeySchema.toString() ] ))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString() , "rhsKeySchema=" + self.rhsKeySchema.toString() , "lhsHashFn='" + self.lhsHashFn + "'" , "rhsHashFn='" + self.rhsHashFn + "'" ] ))) + ")" return super().explain() + exprs
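# A worked example of the tuple-count cost model in localCost() above, under assumed
# (hypothetical) inputs: 10,000 lhs tuples, 50,000 rhs tuples, tupleCost = 1, and a
# 100-page buffer pool. The numbers only illustrate how the three formulas compare.
numTuplesLeft, numTuplesRight = 10000, 50000
tupleCost, bufferPages        = 1, 100

nestedLoops = (numTuplesLeft * tupleCost * numTuplesRight * tupleCost) \
              + (numTuplesLeft * tupleCost)                        # 500,010,000
blockNested = (numTuplesLeft * tupleCost) \
              + ((numTuplesLeft * tupleCost) // (bufferPages - 2)) \
              * (numTuplesRight * tupleCost)                       # 5,110,000
hashCost    = 3 * ((numTuplesLeft * tupleCost)
                   + (numTuplesRight * tupleCost))                 # 180,000

print(nestedLoops, blockNested, hashCost)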
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get( "lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get( "rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.blockIds = [] self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in [ "nested-loops", "block-nested-loops", "indexed", "hash" ]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete join specification, missing join operator parameter" ) # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError( "Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops': 'NL', 'block-nested-loops': 'BNL', 'indexed': 'Index', 'hash': 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
  def __iter__(self):
    self.initializeOutput()
    return iter(self.processAllPages())

  def __next__(self):
    raise NotImplementedError

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()
    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()
    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()
    elif self.joinMethod == "hash":
      return self.hashJoin()
    else:
      raise ValueError("Invalid join method in join operator")

  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

    # No need to track anything but the last output page when in batch mode.
    if self.outputPages:
      self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())

  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    raise NotImplementedError

  # Joins the currently pinned block of outer pages with the full rhs relation.
  def blockJoin(self):
    for lhsPageId in self.blockIds:
      lhsPage = self.storage.bufferPool.getPage(lhsPageId, pinned=True)
      for lTuple in lhsPage:
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, _) in self.rhsPlan:
          rhsPage = self.storage.bufferPool.getPage(rPageId, pinned=True)
          for rTuple in rhsPage:
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

          self.storage.bufferPool.unpinPage(rPageId)

    if self.outputPages:
      self.outputPages = [self.outputPages[-1]]

  def blockNestedLoops(self):
    bp = self.storage.bufferPool
    # Save one free page for the rhs page that we read in while joining the block.
    freePages = bp.numFreePages() - 1
    # Do we need to worry about the number of output pages that we create?
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      bp.getPage(lPageId, pinned=True)
      self.blockIds.append(lPageId)
      # We've used all the pages available in the buffer pool, so join this block.
      if len(self.blockIds) == freePages:
        self.blockJoin()
        for pId in self.blockIds:
          bp.unpinPage(pId)
        self.blockIds = []

    # Join the final, possibly partial, block of outer pages.
    self.blockJoin()
    for pId in self.blockIds:
      bp.unpinPage(pId)
    self.blockIds = []

    return self.storage.pages(self.relationId())

  ##################################
  #
  # Indexed nested loops implementation
  #
  # TODO: test
  def indexedNestedLoops(self):
    raise NotImplementedError

  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    bp = self.storage.bufferPool
    lPartitions = self.partitionPlan('L', self.lhsPlan, self.lhsHashFn, self.lhsSchema, self.lhsKeySchema)
    rPartitions = self.partitionPlan('R', self.rhsPlan, self.rhsHashFn, self.rhsSchema, self.rhsKeySchema)

    # Go through all the keys of our outer partition.
    for lKey in lPartitions:
      lRelId = lPartitions[lKey]
      lFile = self.storage.fileMgr.relationFile(lRelId)[1]

      # Check whether the key is also in the inner partition; only matching buckets
      # can produce join results, so no 'eval' of a hash function is needed here.
      if lKey in rPartitions:
        rRelId = rPartitions[lKey]
        # The file holding all rhs tuples with the same hash value.
        rFile = self.storage.fileMgr.relationFile(rRelId)[1]

        for (lPageId, lPage) in lFile.pages(pinned=True):
          for lTuple in lPage:
            joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

            for (rPageId, rPage) in rFile.pages(pinned=True):
              for rTuple in rPage:
                lKeyCheck = self.lhsSchema.project(self.lhsSchema.unpack(lTuple), self.lhsKeySchema)
                rKeyCheck = self.rhsSchema.project(self.rhsSchema.unpack(rTuple), self.rhsKeySchema)

                # Compare the projected join keys to weed out hash collisions.
                if lKeyCheck == rKeyCheck:
                  joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
                  outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
                  self.emitOutputTuple(self.joinSchema.pack(outputTuple))

              bp.unpinPage(rPageId)
          bp.unpinPage(lPageId)

    # Remove the temporary partition relations.
    for key in lPartitions:
      self.storage.removeRelation(lPartitions[key])
    for key in rPartitions:
      self.storage.removeRelation(rPartitions[key])

    if self.outputPages:
      self.outputPages = [self.outputPages[-1]]

    return self.storage.pages(self.relationId())

  # Partition both rhs and lhs into partition files.
  # Read one partition file at a time.
  # Block join each partition file.
  def partitionPlan(self, planSide, plan, hashFn, planSchema, keySchema):
    partitionFiles = {}
    for (pageId, page) in plan:
      for tup in page:
        joinExprEnv = self.loadSchema(planSchema, tup)
        bucket = eval(hashFn, globals(), joinExprEnv)

        if bucket not in partitionFiles:
          relId = self.relationId() + '_' + planSide + '_' + str(bucket)
          # Recreate the partition relation if it already exists from a prior run.
          if self.storage.hasRelation(relId):
            self.storage.removeRelation(relId)
          self.storage.createRelation(relId, planSchema)
          partitionFiles[bucket] = relId

        partFile = self.storage.fileMgr.relationFile(partitionFiles[bucket])[1]
        partFile.insertTuple(tup)

    return partitionFiles
  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
      ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString()
          , "rhsKeySchema=" + self.rhsKeySchema.toString()
          , "lhsHashFn='" + self.lhsHashFn + "'"
          , "rhsHashFn='" + self.rhsHashFn + "'" ]
      ))) + ")"

    return super().explain() + exprs
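# Block-nested-loops in miniature, to make the blockJoin()/blockNestedLoops() control
# flow above easier to follow: the outer input is consumed one block of pages at a
# time, and the inner input is rescanned once per block rather than once per outer
# page. All names are illustrative; "pages" here are simply lists of tuples.
def block_nested_loops_join(lhs_pages, rhs_pages, predicate, block_size):
  for start in range(0, len(lhs_pages), block_size):
    block = lhs_pages[start:start + block_size]   # the pages we would pin in the buffer pool
    for rhs_page in rhs_pages:                    # one inner scan per outer block
      for rhs_tuple in rhs_page:
        for lhs_page in block:
          for lhs_tuple in lhs_page:
            if predicate(lhs_tuple, rhs_tuple):
              yield lhs_tuple + rhs_tuple

lhs = [[(1,), (2,)], [(3,), (4,)]]                # two outer pages
rhs = [[(2, 'b')], [(4, 'd')]]                    # two inner pages
print(list(block_nested_loops_join(lhs, rhs, lambda l, r: l[0] == r[0], block_size=2)))
# [(2, 2, 'b'), (4, 4, 'd')]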