示例#1
0
	def pickJoinOrder(self, plan):
		# Extract all base relations, along with any unary operators immediately above.
		base_relations = set(plan.root.inputs())

		# Extract all joins in original plan, they serve as the set of joins actually necessary.
		joins = plan.root.joins()

		# Define the dynamic programming table.
		optimal_plans = {}

		# Establish optimal access paths.
		for relation in base_relations:
			optimal_plans[frozenset((relation,))] = relation

		# Fill in the table.
		for i in range(2, len(base_relations) + 1):
			for subset in itertools.combinations(base_relations, i):

				# Build the set of candidate joins.
				candidate_joins = set()
				for candidate_relation in subset:
					candidate_joins.add((
						optimal_plans[frozenset(tuple_without(subset, candidate_relation))],
						optimal_plans[frozenset((candidate_relation,))]
					))

				# Find the best of the candidate joins.
				optimal_plans[frozenset(subset)] = self.get_best_join(candidate_joins, joins)

		# Reconstruct the best plan, prepare and return.
		final_plan = Plan(root = optimal_plans[frozenset(base_relations)])
		final_plan.prepare(self.db)
		plan = final_plan
		return final_plan
示例#2
0
  def pickJoinOrder(self, plan):
    # Extract all base relations, along with any unary operators immediately above.
    base_relations = set(plan.sources)

    # Extract all joins in original plan, they serve as the set of joins actually necessary.
    joins = set(plan.joins)

    # Define the dynamic programming table.
    optimal_plans = {}

    # Establish optimal access paths.
    for relation in base_relations:
      optimal_plans[frozenset((relation,))] = relation

    # Fill in the table.
    for i in range(2, len(base_relations) + 1):
      for subset in itertools.combinations(base_relations, i):

        # Build the set of candidate joins.
        candidate_joins = set()
        for candidate_relation in subset:
          candidate_joins.add((
            optimal_plans[frozenset(tuple_without(subset, candidate_relation))],
            optimal_plans[frozenset((candidate_relation,))]
          ))

        # Find the best of the candidate joins.
        optimal_plans[frozenset(subset)] = self.get_best_join(candidate_joins, joins)

    # Reconstruct the best plan, prepare and return.
    final_plan = Plan(root = optimal_plans[frozenset(base_relations)])
    final_plan.prepare(self.db)
    return final_plan
示例#3
0
	def get_best_join(self, candidates, required_joins):
		best_plan_cost = None
		best_plan = None
		for left, right in candidates:

			relevant_expr = None

			# Find the joinExpr corresponding to the current join candidate. If there is none, it's a
			# cartesian product.
			for join in required_joins:
				names = ExpressionInfo(join.joinExpr).getAttributes()
				if set(join.rhsSchema.fields).intersection(names) and set(join.lhsSchema.fields).intersection(names):
					relevant_expr = join.joinExpr
					break
			else:
				relevant_expr = 'True'

			# Construct a join plan for the current candidate, for each possible join algorithm.
			# TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods.
			for algorithm in ["nested-loops", "block-nested-loops", "hash"]:
				test_plan = Plan(root = Join(
					lhsPlan = left,
					rhsPlan = right,
					method = algorithm,
					expr = relevant_expr
				))

				# Prepare and run the plan in sampling mode, and get the estimated cost.
				test_plan.prepare(self.db)
				test_plan.sample(5.0)
				cost = test_plan.cost(estimated = True)

				# Update running best.
				if best_plan_cost is None or cost < best_plan_cost:
					best_plan_cost = cost
					best_plan = test_plan

		# Need to return the root operator rather than the plan itself, since it's going back into the
		# table.
		return best_plan.root
示例#4
0
  def get_best_join(self, candidates, required_joins):
    best_plan_cost = None
    best_plan = None
    for left, right in candidates:

      relevant_expr = None

      # Find the joinExpr corresponding to the current join candidate. If there is none, it's a
      # cartesian product.
      for join in required_joins:
        names = ExpressionInfo(join.joinExpr).getAttributes()
        if set(join.rhsSchema.fields).intersection(names) and set(join.lhsSchema.fields).intersection(names):
          relevant_expr = join.joinExpr
          break
      else:
        relevant_expr = 'True'

      # Construct a join plan for the current candidate, for each possible join algorithm.
      # TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods.
      for algorithm in ["nested-loops", "block-nested-loops"]:
        test_plan = Plan(root = Join(
          lhsPlan = left,
          rhsPlan = right,
          method = algorithm,
          expr = relevant_expr
        ))

        # Prepare and run the plan in sampling mode, and get the estimated cost.
        test_plan.prepare(self.db)
        test_plan.sample(1.0)
        cost = test_plan.cost(estimated = True)

        # Update running best.
        if best_plan_cost is None or cost < best_plan_cost:
          best_plan_cost = cost
          best_plan = test_plan

    # Need to return the root operator rather than the plan itself, since it's going back into the
    # table.
    return best_plan.root
示例#5
0
    def pickJoinOrder(self, plan):
        joins, tableIDs, optimalSubPlans, fields, nubPlan = self.optimizerSetup(
            plan)
        # Joins is a list of joins
        # TableIDs is a list of the operator on top of a tableScan or the scan itself (Select, Projcect)
        # optimalSubPlan is a dictionary where the key is the top operator ID (from TableID) and val is the operator
        # fields is a dictionary where key is top operator ID, val is the dictionary of fields

        if len(joins) == 0:
            return plan

        numTables = 2

        while numTables <= len(tableIDs):
            print('NumTables: ', numTables)
            joinOrderings = itertools.combinations(tableIDs, numTables)

            # Check each ordering, check each join method
            # Start with two tables total
            # pick one as the LHS, one as the RHS

            for joinOrdering in joinOrderings:  # This iterates through subsets of size numTables
                bestCost = 1e99
                bestPlan = None

                for rhsID in joinOrdering:  # Eventually we'll even iterate through swapping 2-joins
                    lhsIDs = list(joinOrdering)
                    lhsIDs.remove(rhsID)  # Make this one the right side join
                    lhsKey = frozenset(lhsIDs)  # Key for optimalSubPlan dict
                    rhsKey = frozenset([rhsID])  # Key for optimalSubPlan dict

                    cachedLHS = optimalSubPlans[
                        lhsKey] if lhsKey in optimalSubPlans else None  # Get the optimal subPlan
                    cachedRHS = optimalSubPlans[
                        rhsKey]  # Get the optimal subPlan

                    if cachedLHS is None or cachedRHS is None:
                        continue

                    # Do we even care about doing this join?
                    allAttributes = []

                    for lhsID in lhsIDs:
                        allAttributes.extend(fields[frozenset([
                            lhsID
                        ])])  # These are all the attributes in the join
                    allAttributes.extend(fields[rhsKey])

                    contains, planDict = self.checkJoins(
                        joins, allAttributes, cachedLHS, cachedRHS)

                    # print('Got to Contains')

                    if contains:

                        # print('PlanDict: ', planDict)

                        for joinMethod in [
                                "hash", "nested-loops", "block-nested-loops"
                        ]:

                            if joinMethod == "hash":
                                # print('HashMethod')
                                lhsPlan = cachedLHS
                                rhsPlan = cachedRHS
                                lhsHashFn = planDict['lhsHashFn']
                                rhsHashFn = planDict['rhsHashFn']
                                lhsKeySchema = planDict['lhsKeySchema']
                                rhsKeySchema = planDict['rhsKeySchema']

                                tryPlan = Plan(
                                    root=Join(method=joinMethod,
                                              lhsPlan=cachedLHS,
                                              lhsHashFn=lhsHashFn,
                                              lhsKeySchema=lhsKeySchema,
                                              rhsPlan=cachedRHS,
                                              rhsHashFn=rhsHashFn,
                                              rhsKeySchema=rhsKeySchema))

                                self.checkPlan = tryPlan
                                tryPlan.prepare(self.db)
                                tryPlan.sample(1.0)
                                cost = tryPlan.cost(estimated=True)
                                # print('HashCost: ', cost)

                            else:
                                # print(joinMethod)
                                joinExpr = planDict['joinExpr']
                                tryPlan = Plan(root=Join(lhsPlan=cachedLHS,
                                                         rhsPlan=cachedRHS,
                                                         method=joinMethod,
                                                         expr=joinExpr))

                                self.checkPlan = tryPlan
                                tryPlan.prepare(self.db)
                                tryPlan.sample(1.0)
                                cost = tryPlan.cost(estimated=True)
                                # print(joinMethod + ' Cost: ', cost)

                            if cost < bestCost:
                                bestCost = cost
                                bestPlan = tryPlan

                newKey = frozenset(joinOrdering)
                self.addPlanCost(newKey, bestCost, bestPlan)
                optimalSubPlans[
                    newKey] = bestPlan.root if bestPlan is not None else None

            numTables += 1

        nubPlan.subPlan = self.statsCache[frozenset(tableIDs)][1].root

        plan.prepare(self.db)

        return plan
示例#6
0
  def pickJoinOrder(self, plan):
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)
    # makes dicts that maps a list of relations to exprs involving that list
    # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C
    # and on top of it the select exprs that involve 2 tables A,C or B,C

    isGroupBy = True if plan.root.operatorType() == "GroupBy" else False
    outputSchema = plan.schema() 
    self.reportPlanCount = 0

    worklist = []
    for r in relations:
      table = TableScan(r,self.db.relationSchema(r))
      table.prepare(self.db)
      if (r,) in selectTablesDict: 
        selectExprs = selectTablesDict[(r,)]
        selectString = self.combineSelects(selectExprs)
        select = Select(table,selectString)
        select.prepare(self.db)
        worklist.append(Plan(root=select))
      else:
        worklist.append(Plan(root=table))

    while(len(worklist) > 1):
      combos = itertools.combinations(worklist,2)
      bestJoin = None
      sourcePair = None

      for pair in combos:
        op1 = pair[0].root
        op2 = pair[1].root

        selectExpr = self.createExpression(pair[0].relations(), pair[1].relations(), selectTablesDict)
        joinExpr = self.createExpression(pair[0].relations(), pair[1].relations(), joinTablesDict)
        
        join1BnljOp = Join(op1, op2, expr=joinExpr, method="block-nested-loops" )
        join2BnljOp = Join(op2, op1, expr=joinExpr, method="block-nested-loops" )


        join1NljOp = Join(op1, op2, expr=joinExpr, method="nested-loops" )
        join2NljOp = Join(op2, op1, expr=joinExpr, method="nested-loops" )

        if selectExpr == "True":
          full1BnljOp = join1BnljOp
          full2BnljOp = join2BnljOp
          
          full1NljOp = join1NljOp
          full2NljOp = join2NljOp

        else:
          full1BnljOp = Select(join1BnljOp, selectExpr)
          full2BnljOp = Select(join2BnljOp, selectExpr)
          
          full1NljOp = Select(join1NljOp, selectExpr)
          full2NljOp = Select(join2NljOp, selectExpr)
        

        joinList = [full1BnljOp, full2BnljOp, full1NljOp, full2NljOp]

        for j in joinList:
          joinplan = Plan(root=j)
          joinplan.prepare(self.db)
          joinplan.sample(100)

          if bestJoin == None or joinplan.cost(True) < bestJoin.cost(True):
            bestJoin = joinplan
            sourcePair = pair

        self.reportPlanCount += 4
        self.clearSampleFiles()



      worklist.remove(sourcePair[0])
      worklist.remove(sourcePair[1])
      worklist.append(bestJoin)

    # after System R algorithm
    newPlan = worklist[0]

    if isGroupBy:
      newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \
        aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \
        aggExprs=plan.root.aggExprs, \
        groupHashFn=plan.root.groupHashFn)
      newGroupBy.prepare(self.db)
      newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
      projectDict = {}

      for f, t in outputSchema.schema():
        projectDict[f] = (f, t) 
      
      currRoot = newPlan.root
      project = Project(currRoot, projectDict)
      project.prepare(self.db)
      newPlan = Plan(root=project)
  
    return newPlan
示例#7
0
  def pickJoinOrder(self, plan):
    
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    

    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)
    # makes dicts that maps a list of relations to exprs involving that list
    # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C
    # and on top of it the select exprs that involve 2 tables A,C or B,C

    isGroupBy = True if plan.root.operatorType() == "GroupBy" else False
    outputSchema = plan.schema() 
    optDict = {}
    self.reportPlanCount = 0

    for npass in range(1, len(relations) + 1):
      if npass == 1:
        for r in relations:
          table = TableScan(r,self.db.relationSchema(r))
          if (r,) in selectTablesDict: 
            selectExprs = selectTablesDict[(r,)]
            selectString = self.combineSelects(selectExprs)
            select = Select(table,selectString)
            optDict[(r,)] = Plan(root=select)
          else:
            optDict[(r,)] = Plan(root=table)
          self.reportPlanCount += 1
      else:
        combinations = itertools.combinations(relations,npass)
        for c in combinations:
          fullList = sorted(c)
          clist = self.getCombos(fullList)
          bestJoin = None
          for subcombo in clist:
            complement = self.getComplement(fullList, subcombo)
            
            leftOps = optDict[tuple(complement)].root
            rightOps = optDict[tuple(subcombo)].root

            selectExpr = self.createExpression(complement, subcombo, selectTablesDict)
            joinExpr = self.createExpression(complement, subcombo, joinTablesDict)
            
            joinBnljOp = Join(leftOps, rightOps, expr=joinExpr, method="block-nested-loops" )
            fullBnljOp = Select(joinBnljOp, selectExpr)

            if selectExpr == "True":
              joinBnlj = Plan(root=joinBnljOp)
            else:
              joinBnlj = Plan(root=fullBnljOp)
            
            joinBnlj.prepare(self.db)
            joinBnlj.sample(100)
            
            joinNljOp = Join(leftOps, rightOps, expr=joinExpr, method="nested-loops" )
            fullNljOp = Select(joinNljOp, selectExpr)

            if selectExpr == "True":
              joinNlj = Plan(root=joinNljOp)
            else:
              joinNlj = Plan(root=fullNljOp)
            
            joinNlj.prepare(self.db)
            joinNlj.sample(100)

            if joinBnlj.cost(True) < joinNlj.cost(True):
              if bestJoin == None or joinBnlj.cost(True) < bestJoin.cost(True):
                bestJoin = joinBnlj
            else:
              if bestJoin == None or joinNlj.cost(True) < bestJoin.cost(True):
                bestJoin = joinNlj

            self.reportPlanCount += 2
            self.clearSampleFiles()

          optDict[tuple(fullList)] = bestJoin
          
    # after System R algorithm
    newPlan = optDict[tuple(sorted(relations))]

    if isGroupBy:
      newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \
        aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \
        aggExprs=plan.root.aggExprs, \
        groupHashFn=plan.root.groupHashFn)
      newGroupBy.prepare(self.db)
      newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
      projectDict = {}

      for f, t in outputSchema.schema():
        projectDict[f] = (f, t) 
      
      currRoot = newPlan.root
      project = Project(currRoot, projectDict)
      project.prepare(self.db)
      newPlan = Plan(root=project)
  
    return newPlan
示例#8
0
  def pickJoinOrder(self, plan):
    self.numPlansConsidered = 0

    tableIds = list()
    joinOps  = list()
    optPlans = dict()
    fields   = dict()

    self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields)

    # Create worklist consisting of table IDs
    worklist = [str(tableId) for tableId in tableIds]

    # Now work our way down the 'worklist' greedily
    numTables = len(worklist)
    while numTables >= 2:
      # Choose the cheapest join that can be made over the remaining sub-plans
      minCost  = None
      optPlan  = None
      optLhsId = None
      optRhsId = None

      possibleJoinOrders = itertools.combinations(worklist, 2)
      for possibleJoinOrder in possibleJoinOrders:
        # Start examining each possible plan
        lhsId = possibleJoinOrder[0]
        rhsId = possibleJoinOrder[1]

        lhsOpt = optPlans[lhsId] if lhsId in optPlans else None
        rhsOpt = optPlans[rhsId] if rhsId in optPlans else None

        if lhsOpt is None or rhsOpt is None:
          continue # Skip irrelevant joins

        # This is to take care of multi-way joins added to worklist
        lhsIds = lhsId.split(",")
        rhsIds = rhsId.split(",")

        # Form a list of available attributes of this join
        allAttrs = list()
        for lId in lhsIds:
          allAttrs.extend(fields[int(lId)])

        for rId in rhsIds:
          allAttrs.extend(fields[int(rId)])

        currJoinExpr = None

        # Check whether any join expression can be satisfied with this join
        for join in joinOps:   
          if join.joinExpr:
            joinAttrs = ExpressionInfo(join.joinExpr).getAttributes()
            if self.contains(allAttrs, joinAttrs):
              currJoinExpr = join.joinExpr
              break

        if currJoinExpr is None:
          continue # Skip irrelevant joins

        self.numPlansConsidered += 2
            
        # Compare costs of different type of joins 
        for joinMethod in ["nested-loops", "block-nested-loops"]:
          possiblePlan = Plan(root=Join(lhsPlan=lhsOpt, rhsPlan=rhsOpt, method=joinMethod, expr=currJoinExpr))

          possiblePlan.prepare(self.db)
          possiblePlan.sample(1.0) # Sampling causes too much overhead!
          cost = self.getPlanCost(plan)
          cost = possiblePlan.cost(estimated=True) if cost is None else cost
          self.addPlanCost(plan, cost)

          if minCost is None or cost < minCost:
            minCost  = cost
            optPlan  = possiblePlan
            optLhsId = lhsId
            optRhsId = rhsId

        # Switch left and right and compare again
        for joinMethod in ["nested-loops", "block-nested-loops"]:
          possiblePlan = Plan(root=Join(lhsPlan=rhsOpt, rhsPlan=lhsOpt, method=joinMethod, expr=currJoinExpr))

          possiblePlan.prepare(self.db)
          possiblePlan.sample(1.0) # Sampling causes too much overhead!
          cost = self.getPlanCost(plan)
          cost = possiblePlan.cost(estimated=True) if cost is None else cost
          self.addPlanCost(plan, cost)

          if minCost is None or cost < minCost:
            minCost  = cost
            optPlan  = possiblePlan
            optLhsId = rhsId
            optRhsId = lhsId

      if optPlan is not None:
        # Update optimal plan
        joinKey = optLhsId + "," + optRhsId
        optPlans[joinKey] = optPlan.root

        # Update worklist
        worklist.remove(optLhsId)
        worklist.remove(optRhsId)
        worklist.append(joinKey)

      numTables = numTables - 1

    # Return single plan left in worklist
    return optPlans[worklist[0]]
示例#9
0
    def pickJoinOrder(self, plan):
        relations = plan.relations()
        fieldDict = self.obtainFieldDict(plan)
        (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)
        # makes dicts that maps a list of relations to exprs involving that list
        # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C
        # and on top of it the select exprs that involve 2 tables A,C or B,C

        isGroupBy = True if plan.root.operatorType() == "GroupBy" else False
        outputSchema = plan.schema()
        self.reportPlanCount = 0

        worklist = []
        for r in relations:
            table = TableScan(r, self.db.relationSchema(r))
            table.prepare(self.db)
            if (r, ) in selectTablesDict:
                selectExprs = selectTablesDict[(r, )]
                selectString = self.combineSelects(selectExprs)
                select = Select(table, selectString)
                select.prepare(self.db)
                worklist.append(Plan(root=select))
            else:
                worklist.append(Plan(root=table))

        while (len(worklist) > 1):
            combos = itertools.combinations(worklist, 2)
            bestJoin = None
            sourcePair = None

            for pair in combos:
                op1 = pair[0].root
                op2 = pair[1].root

                selectExpr = self.createExpression(pair[0].relations(),
                                                   pair[1].relations(),
                                                   selectTablesDict)
                joinExpr = self.createExpression(pair[0].relations(),
                                                 pair[1].relations(),
                                                 joinTablesDict)

                join1BnljOp = Join(op1,
                                   op2,
                                   expr=joinExpr,
                                   method="block-nested-loops")
                join2BnljOp = Join(op2,
                                   op1,
                                   expr=joinExpr,
                                   method="block-nested-loops")

                join1NljOp = Join(op1,
                                  op2,
                                  expr=joinExpr,
                                  method="nested-loops")
                join2NljOp = Join(op2,
                                  op1,
                                  expr=joinExpr,
                                  method="nested-loops")

                if selectExpr == "True":
                    full1BnljOp = join1BnljOp
                    full2BnljOp = join2BnljOp

                    full1NljOp = join1NljOp
                    full2NljOp = join2NljOp

                else:
                    full1BnljOp = Select(join1BnljOp, selectExpr)
                    full2BnljOp = Select(join2BnljOp, selectExpr)

                    full1NljOp = Select(join1NljOp, selectExpr)
                    full2NljOp = Select(join2NljOp, selectExpr)

                joinList = [full1BnljOp, full2BnljOp, full1NljOp, full2NljOp]

                for j in joinList:
                    joinplan = Plan(root=j)
                    joinplan.prepare(self.db)
                    joinplan.sample(100)

                    if bestJoin == None or joinplan.cost(True) < bestJoin.cost(
                            True):
                        bestJoin = joinplan
                        sourcePair = pair

                self.reportPlanCount += 4
                self.clearSampleFiles()

            worklist.remove(sourcePair[0])
            worklist.remove(sourcePair[1])
            worklist.append(bestJoin)

        # after System R algorithm
        newPlan = worklist[0]

        if isGroupBy:
            newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \
              aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \
              aggExprs=plan.root.aggExprs, \
              groupHashFn=plan.root.groupHashFn)
            newGroupBy.prepare(self.db)
            newPlan = Plan(root=newGroupBy)

        if set(outputSchema.schema()) != set(newPlan.schema().schema()):
            projectDict = {}

            for f, t in outputSchema.schema():
                projectDict[f] = (f, t)

            currRoot = newPlan.root
            project = Project(currRoot, projectDict)
            project.prepare(self.db)
            newPlan = Plan(root=project)

        return newPlan
示例#10
0
    def pickJoinOrder(self, plan):

        relations = plan.relations()
        fieldDict = self.obtainFieldDict(plan)

        (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)
        # makes dicts that maps a list of relations to exprs involving that list
        # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C
        # and on top of it the select exprs that involve 2 tables A,C or B,C

        isGroupBy = True if plan.root.operatorType() == "GroupBy" else False
        outputSchema = plan.schema()
        optDict = {}
        self.reportPlanCount = 0

        for npass in range(1, len(relations) + 1):
            if npass == 1:
                for r in relations:
                    table = TableScan(r, self.db.relationSchema(r))
                    if (r, ) in selectTablesDict:
                        selectExprs = selectTablesDict[(r, )]
                        selectString = self.combineSelects(selectExprs)
                        select = Select(table, selectString)
                        optDict[(r, )] = Plan(root=select)
                    else:
                        optDict[(r, )] = Plan(root=table)
                    self.reportPlanCount += 1
            else:
                combinations = itertools.combinations(relations, npass)
                for c in combinations:
                    fullList = sorted(c)
                    clist = self.getCombos(fullList)
                    bestJoin = None
                    for subcombo in clist:
                        complement = self.getComplement(fullList, subcombo)

                        leftOps = optDict[tuple(complement)].root
                        rightOps = optDict[tuple(subcombo)].root

                        selectExpr = self.createExpression(
                            complement, subcombo, selectTablesDict)
                        joinExpr = self.createExpression(
                            complement, subcombo, joinTablesDict)

                        joinBnljOp = Join(leftOps,
                                          rightOps,
                                          expr=joinExpr,
                                          method="block-nested-loops")
                        fullBnljOp = Select(joinBnljOp, selectExpr)

                        if selectExpr == "True":
                            joinBnlj = Plan(root=joinBnljOp)
                        else:
                            joinBnlj = Plan(root=fullBnljOp)

                        joinBnlj.prepare(self.db)
                        joinBnlj.sample(100)

                        joinNljOp = Join(leftOps,
                                         rightOps,
                                         expr=joinExpr,
                                         method="nested-loops")
                        fullNljOp = Select(joinNljOp, selectExpr)

                        if selectExpr == "True":
                            joinNlj = Plan(root=joinNljOp)
                        else:
                            joinNlj = Plan(root=fullNljOp)

                        joinNlj.prepare(self.db)
                        joinNlj.sample(100)

                        if joinBnlj.cost(True) < joinNlj.cost(True):
                            if bestJoin == None or joinBnlj.cost(
                                    True) < bestJoin.cost(True):
                                bestJoin = joinBnlj
                        else:
                            if bestJoin == None or joinNlj.cost(
                                    True) < bestJoin.cost(True):
                                bestJoin = joinNlj

                        self.reportPlanCount += 2
                        self.clearSampleFiles()

                    optDict[tuple(fullList)] = bestJoin

        # after System R algorithm
        newPlan = optDict[tuple(sorted(relations))]

        if isGroupBy:
            newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \
              aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \
              aggExprs=plan.root.aggExprs, \
              groupHashFn=plan.root.groupHashFn)
            newGroupBy.prepare(self.db)
            newPlan = Plan(root=newGroupBy)

        if set(outputSchema.schema()) != set(newPlan.schema().schema()):
            projectDict = {}

            for f, t in outputSchema.schema():
                projectDict[f] = (f, t)

            currRoot = newPlan.root
            project = Project(currRoot, projectDict)
            project.prepare(self.db)
            newPlan = Plan(root=project)

        return newPlan
示例#11
0
  def pickJoinOrder(self, plan):
    # Some restrictions apply:
    # 1. Cannot involve hash-joins or index-joins
    # 2. Only join operations beyond certain point (i.e. cannot have join -> aggregation -> join, etc.)

    tableIds = list()
    joinOps  = list()
    optPlans = dict()
    fields   = dict()

    firstOpWithJoins = self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields)

    if len(joinOps) == 0:
      return plan
    
    numTables = 2
    while numTables <= len(tableIds):
      possibleJoinOrders = itertools.combinations(tableIds, numTables)
      for possibleJoinOrder in possibleJoinOrders:
        minCost = None
        optPlan = None
        for tableId in possibleJoinOrder:
          # Left-deep-only optimizer (i.e. rhs operand is a base relation)
          lhsIds = list(possibleJoinOrder)
          lhsIds.remove(tableId)

          lhsJoinKey = self.getJoinKey(lhsIds)
          rhsJoinKey = str(tableId)

          lhsOpt = optPlans[lhsJoinKey] if lhsJoinKey in optPlans else None
          rhsOpt = optPlans[rhsJoinKey] if rhsJoinKey in optPlans else None

          if lhsOpt is None or rhsOpt is None:
            continue # Skip irrelevant joins

          # Form a list of available attributes of this join
          allAttrs = list()
          for lhsId in lhsIds:
            allAttrs.extend(fields[lhsId])

          allAttrs.extend(fields[tableId])

          currJoinExpr = None

          # Check whether any join expression can be satisfied with this join
          for join in joinOps:      
            if join.joinExpr:
              joinAttrs = ExpressionInfo(join.joinExpr).getAttributes()
              if self.contains(allAttrs, joinAttrs):
                currJoinExpr = join.joinExpr
                break
            else:
              # Should not involve hash-joins or index-joins (limitation)
              return plan

          if currJoinExpr is None:
            continue # Irrelevant join

          for joinMethod in ["nested-loops", "block-nested-loops"]:
            possiblePlan = Plan(root=Join(lhsPlan=lhsOpt, rhsPlan=rhsOpt, method=joinMethod, expr=currJoinExpr))

            possiblePlan.prepare(self.db)
            possiblePlan.sample(1.0) # Sampling causes too much overhead!
            cost = self.getPlanCost(plan)
            cost = possiblePlan.cost(estimated=True) if cost is None else cost
            self.addPlanCost(plan, cost)

            if minCost is None or cost < minCost:
              minCost = cost
              optPlan = possiblePlan

        optPlans[self.getJoinKey(possibleJoinOrder)] = None if optPlan is None else optPlan.root 

      numTables = numTables + 1

    firstOpWithJoins.subPlan = optPlans[self.getJoinKey(tableIds)]

    plan.prepare(self.db)

    return plan