Exemplo n.º 1
0
    def joinsOptimizer(self, operator, aPaths):
        defaultScaleFactor = 50
        defaultPartiNumber = 5
        # build join constraint list;
        joinExprs = self.decodeJoinExprs(operator)
        # build a local plan-cost dict:
        prev = dict()
        curr = dict()
        n = len(aPaths)
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganize joins
            # so that we simple put accessPaths' totalcost as 0.
            self.addPlanCost(aPath, (numPages, 0))
            prev[aPath] = (numPages, 0)
        # i = 2...n
        for i in range(1, n):
            # build current list with prev.
            # For 2-way joins, we don't need to care left deep plan
            for p in prev.keys():
                accP = self.allAccessPaths(p)
                remL = [item for item in aPaths if item not in accP]
                for base in remL:
                    lhsSchema = p.schema()
                    rhsSchema = base.schema()
                    newJoin = None
                    (sCostL, tCostL) = prev[p]
                    (rPlan, costR) = self.getPlanCost(base)
                    # Here we are using System-R 's heuristic to eliminate permutations as
                    # much as possible.
                    # Reference: Selinger, 1979, http://www.cs.berkeley.edu/~brewer/cs262/3-selinger79.pdf
                    for (lField, rField) in joinExprs:
                        if lField in lhsSchema.fields and rField in rhsSchema.fields:
                            # Build Join
                            # We only select hashjoin for building join plans
                            # This is because the nested-loop-join contains a bug
                            lKeySchema = DBSchema('left', [
                                (f, t)
                                for (f, t) in lhsSchema.schema() if f == lField
                            ])
                            rKeySchema = DBSchema('right', [
                                (f, t)
                                for (f, t) in rhsSchema.schema() if f == rField
                            ])
                            lHashFn = 'hash(' + lField + ') % ' + str(
                                defaultPartiNumber)
                            rHashFn = 'hash(' + rField + ') % ' + str(
                                defaultPartiNumber)
                            newJoin = Join(p, rPlan, method='hash', \
                                           lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                           rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                        elif lField in rhsSchema.fields and rField in lhsSchema.fields:
                            # Build Join
                            # We only select hashjoin for building join plans
                            # This is because the nested-loop-join contains a bug
                            lKeySchema = DBSchema('left', [
                                (f, t)
                                for (f, t) in rhsSchema.schema() if f == lField
                            ])
                            rKeySchema = DBSchema('right', [
                                (f, t)
                                for (f, t) in lhsSchema.schema() if f == rField
                            ])
                            lHashFn = 'hash(' + rField + ') % ' + str(
                                defaultPartiNumber)
                            rHashFn = 'hash(' + lField + ') % ' + str(
                                defaultPartiNumber)
                            newJoin = Join(p, rPlan, method='hash', \
                                           lhsHashFn=lHashFn, lhsKeySchema=rKeySchema, \
                                           rhsHashFn=rHashFn, rhsKeySchema=lKeySchema)
                        else:
                            continue

                        if newJoin is not None:
                            # Let's push newJoin onto the cache and curr list
                            # cost: 3(M+N) + M's totalcost
                            # then we renew newJoin's stepcost
                            newJoin.prepare(self.db)
                            stepCost = 3 * (sCostL + costR[0])
                            totalCost = stepCost + tCostL
                            cards = Plan(
                                root=newJoin).sample(defaultScaleFactor)
                            pageSize, _, _ = self.db.storage.relationStats(
                                newJoin.relationId())
                            pages = cards / (pageSize / newJoin.schema().size)
                            self.addPlanCost(newJoin, (pages, totalCost))
                            curr[newJoin] = (pages, totalCost)

            prev = curr
            curr = dict()

        del prev, curr

        return self.getPlanCost(operator)[0]
Exemplo n.º 2
0
    def joinsOptimizer(self, operator, aPaths):
        defaultScaleFactor = 10
        defaultPartiNumber = 5

        n = len(aPaths)
        planList = []
        costList = []
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganize joins
            # so that we simple put accessPaths' total cost as 0.
            planList.append(aPath)
            costList.append((numPages, 0))
        # i = 2...n
        for i in range(1, n):
            # find all possible two way join in current planList
            # put the potential joins in potentialP
            # put the potential joins cost in potentialC
            m = len(planList)
            potentialP = []
            potentialC = []
            for j in range(0, m - 1):
                for k in range(j + 1, m):
                    self.pcntr += 1
                    potentialP.append((planList[j], planList[k]))
                    potentialC.append(3 * (costList[j][0] + costList[k][0]) +
                                      costList[j][1] + costList[k][1])
            # find the cheapest joinable join (total cost)
            # build the join, remove the used two base plan and add the new join to planList
            # modify the costList as well
            while (potentialC):
                currC = min(potentialC)
                currP = potentialP[potentialC.index(currC)]
                potentialC.remove(currC)
                potentialP.remove(currP)
                if (self.joinable(operator, currP)):
                    (lField, rField) = self.joinable(operator, currP)
                    lhsSchema = currP[0].schema()
                    rhsSchema = currP[1].schema()
                    lKeySchema = DBSchema(
                        'left',
                        [(f, t)
                         for (f, t) in lhsSchema.schema() if f == lField])
                    rKeySchema = DBSchema(
                        'right',
                        [(f, t)
                         for (f, t) in rhsSchema.schema() if f == rField])
                    lHashFn = 'hash(' + lField + ') % ' + str(
                        defaultPartiNumber)
                    rHashFn = 'hash(' + rField + ') % ' + str(
                        defaultPartiNumber)
                    newJoin = Join(currP[0], currP[1], method='hash', \
                                   lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                   rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                    newJoin.prepare(self.db)
                    totalCost = currC
                    cards = Plan(root=newJoin).sample(defaultScaleFactor)
                    pageSize, _, _ = self.db.storage.relationStats(
                        newJoin.relationId())
                    pages = cards / (pageSize / newJoin.schema().size)

                    id1 = planList.index(currP[0])
                    _ = planList.pop(id1)
                    id2 = planList.index(currP[1])
                    _ = planList.pop(id2)
                    planList.append(newJoin)
                    _ = costList.pop(id1)
                    _ = costList.pop(id2)
                    costList.append((pages, totalCost))
                    break
        print("GreedyOptimizer plan considered: ", self.pcntr)
        return planList[0]
Exemplo n.º 3
0
    def joinsOptimizer(self, operator, aPaths):
        defaultScaleFactor = 10
        defaultPartiNumber = 5
        # build join constraint list;
        joinExprs = self.decodeJoinExprs(operator)
        # build a local plan-cost dict:
        n = len(aPaths)
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganize joins
            # so that we simple put accessPaths' totalcost as 0.
            self.addPlanCost(aPath, (numPages, 0))

        for i in range(1, n):
            for S in comb(aPaths, i + 1):
                for O in self.powerSet(S):

                    # The following codes are added because some subPlans may
                    # not be present in the self.statsCache as
                    # 1) it was filtered out because it is a right-deep
                    # 2) it has not any constraint associated.
                    keyL = tuple(sorted(list(map(lambda x: x.id(), O))))
                    keyR = tuple(
                        sorted(
                            list(
                                map(lambda x: x.id(),
                                    [ele for ele in S if ele not in O]))))

                    planForO = None
                    remindPl = None
                    costL = None
                    costR = None

                    if keyL in self.statsCache and keyR in self.statsCache:
                        (planForO, costL) = self.statsCache[tuple(
                            sorted(list(map(lambda x: x.id(), O))))]
                        (remindPl, costR) = self.statsCache[tuple(
                            sorted(
                                list(
                                    map(lambda x: x.id(),
                                        [ele for ele in S if ele not in O]))))]
                    else:
                        continue

                    fields = self.joinable(joinExprs, [planForO, remindPl])
                    # If we detect constraints, we will create a new join from here.
                    if fields is not None:
                        lKeySchema = DBSchema(
                            'left', [(f, t)
                                     for (f, t) in planForO.schema().schema()
                                     if f == fields[0]])
                        rKeySchema = DBSchema(
                            'right', [(f, t)
                                      for (f, t) in remindPl.schema().schema()
                                      if f == fields[1]])
                        lHashFn = 'hash(' + fields[0] + ') % ' + str(
                            defaultPartiNumber)
                        rHashFn = 'hash(' + fields[1] + ') % ' + str(
                            defaultPartiNumber)
                        newJoin = Join(planForO, remindPl, method='hash', \
                                       lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                       rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)
                        if (i == 1) or (not self.isRightDeep(newJoin, aPaths)):
                            newJoin.prepare(self.db)
                            # Calculate output pages;
                            cards = Plan(
                                root=newJoin).sample(defaultScaleFactor)
                            pageSize, _, _ = self.db.storage.relationStats(
                                newJoin.relationId())
                            pages = cards / (pageSize / newJoin.schema().size)
                            # Calculate output costs:
                            totalCost = costL[1] + costR[1] + 3 * (costL[0] +
                                                                   costR[0])
                            # Add new Join to self.statsCache
                            self.addPlanCost(newJoin, (pages, totalCost))

        return self.getPlanCost(operator)[0]