Exemplo n.º 1
0
 def joinSubPred(self, sch1, sch2):
     """Return the sub-predicate that applies only to the union of two schemas.

     A term of this predicate is selected when it applies to neither
     ``sch1`` nor ``sch2`` on its own, but does apply to a schema that
     contains the contents of both.

     :param sch1: the first schema
     :param sch2: the second schema
     :return: a new Predicate holding the selected terms, or ``None``
         when no term qualifies
     """
     joined = Predicate()
     combined = Schema()
     for part in (sch1, sch2):
         combined.addAll(part)

     for term in self.terms:
         # Terms already satisfiable by one side alone are skipped.
         if term.appliesTo(sch1) or term.appliesTo(sch2):
             continue
         if term.appliesTo(combined):
             joined.terms.append(term)

     return joined if len(joined.terms) > 0 else None
Exemplo n.º 2
0
class ProductPlan(Plan):
    """Plan node for the product of two subqueries."""

    def __init__(self, p1, p2):
        """Create a new product node having the two specified subqueries.

        The node's schema is built once, here, from the schemas of both
        subqueries.

        :param p1: the left-hand subquery
        :param p2: the right-hand subquery
        """
        super(ProductPlan, self).__init__()
        self.p1 = p1
        self.p2 = p2
        combined = Schema()
        for subplan in (p1, p2):
            combined.addAll(subplan.schema())
        self._schema = combined

    def open(self):
        """Open both subqueries and return a ProductScan over their scans."""
        return ProductScan(self.p1.open(), self.p2.open())

    def blocksAccessed(self):
        """Estimate the number of block accesses in the product.

        Formula: B(product(p1,p2)) = B(p1) + R(p1) * B(p2)
        """
        left_blocks = self.p1.blocksAccessed()
        # The right-hand side is costed once per left-hand output record.
        right_cost = self.p1.recordsOutput() * self.p2.blocksAccessed()
        return left_blocks + right_cost

    def recordsOutput(self):
        """Estimate the number of output records in the product.

        Formula: R(product(p1,p2)) = R(p1) * R(p2)
        """
        left_records = self.p1.recordsOutput()
        right_records = self.p2.recordsOutput()
        return left_records * right_records

    def distinctValues(self, fldname):
        """Estimate the number of distinct values of a field in the product.

        The estimate is delegated to p1 when p1's schema has the field,
        and to p2 otherwise.

        :param fldname: the name of the field
        """
        owner = self.p1 if self.p1.schema().hasField(fldname) else self.p2
        return owner.distinctValues(fldname)

    def schema(self):
        """Return the schema of the product (built from both subqueries)."""
        return self._schema
class MultibufferProductPlan(Plan):
    """Plan for the product of two queries, opened as a MultibufferProductScan.

    The left-hand query is wrapped in a MaterializePlan at construction
    time; the right-hand query is copied into a temporary table each time
    the plan is opened.
    """

    def __init__(self, tx, lhs, rhs):
        """Create a product plan for the specified queries.

        :param tx: the calling transaction
        :param lhs: the plan for the LHS query
        :param rhs: the plan for the RHS query
        """
        super(MultibufferProductPlan, self).__init__()
        self.tx = tx
        self.lhs = MaterializePlan(tx, lhs)
        self.rhs = rhs

        # Kept under a private name: an instance attribute called
        # ``schema`` would shadow the schema() method, so that
        # ``plan.schema()`` would try to call the Schema object itself.
        self._schema = Schema()
        self._schema.addAll(lhs.schema())
        self._schema.addAll(rhs.schema())

    def open(self):
        """Create and return a scan for this query.

        The (materialized) LHS plan is opened, the RHS records are copied
        into a temporary table, and a MultibufferProductScan is built from
        the LHS scan plus the temporary table's name and layout.
        """
        leftscan = self.lhs.open()
        tt = self.copyRecordsFrom(self.rhs)
        return MultibufferProductScan(self.tx, leftscan, tt.tableName(),
                                      tt.getLayout())

    def blocksAccessed(self):
        """Estimate the number of block accesses required by the query.

        Formula: B(product(p1,p2)) = B(p2) + B(p1) * C(p2), where C(p2) is
        the number of chunks of p2. C(p2) is computed from the number of
        buffers available at the time of the call, so the value may differ
        when the query scan is actually opened.
        """
        #  this guesses at the # of chunks
        avail = self.tx.availableBuffs()
        size = MaterializePlan(self.tx, self.rhs).blocksAccessed()
        # Truncating division: the quotient is rounded toward zero, so it
        # is 0 whenever the materialized RHS is smaller than ``avail``.
        numchunks = int(size / avail)
        return self.rhs.blocksAccessed() + (self.lhs.blocksAccessed() *
                                            numchunks)

    def recordsOutput(self):
        """Estimate the number of output records in the product.

        Formula: R(product(p1,p2)) = R(p1) * R(p2)
        """
        return self.lhs.recordsOutput() * self.rhs.recordsOutput()

    def distinctValues(self, fldname):
        """Estimate the number of distinct values of a field in the product.

        The estimate is delegated to the LHS plan when its schema has the
        field, and to the RHS plan otherwise.

        :param fldname: the name of the field
        """
        if self.lhs.schema().hasField(fldname):
            return self.lhs.distinctValues(fldname)
        else:
            return self.rhs.distinctValues(fldname)

    def schema(self):
        """Return the schema of the product.

        The schema is built in ``__init__`` from the schemas of both
        underlying queries.
        """
        return self._schema

    def copyRecordsFrom(self, p):
        """Copy every record produced by plan ``p`` into a new TempTable.

        :param p: the plan whose output is copied
        :return: the TempTable holding the copied records
        """
        src = p.open()
        sch = p.schema()
        t = TempTable(self.tx, sch)
        dest = t.open()
        while src.next():
            # One new destination record per source record, field by field.
            dest.insert()
            for fldname in sch.fields():
                dest.setVal(fldname, src.getVal(fldname))
        src.close()
        dest.close()
        return t
Exemplo n.º 4
0
class MergeJoinPlan(Plan):
    """Plan for a merge join of two queries on one join field from each."""

    def __init__(self, tx, p1, p2, fldname1, fldname2):
        """Create a mergejoin plan for the two specified queries.

        Each query is wrapped in a SortPlan on its own join field; the
        join's schema is built from the schemas of the unsorted inputs.

        :param tx: the calling transaction
        :param p1: the LHS query plan
        :param p2: the RHS query plan
        :param fldname1: the LHS join field
        :param fldname2: the RHS join field
        """
        super(MergeJoinPlan, self).__init__()
        self.fldname1 = fldname1
        self.p1 = SortPlan(tx, p1, [fldname1])

        self.fldname2 = fldname2
        self.p2 = SortPlan(tx, p2, [fldname2])

        self.sch = Schema()
        for source in (p1, p2):
            self.sch.addAll(source.schema())

    def open(self):
        """Open both sorted plans and return a MergeJoinScan over them.

        The scan is given the two opened scans and the two join fields.
        """
        left_scan = self.p1.open()
        right_scan = self.p2.open()
        return MergeJoinScan(left_scan, right_scan, self.fldname1,
                             self.fldname2)

    def blocksAccessed(self):
        """Return the number of block accesses required for the mergejoin.

        This is the sum of the block accesses reported by the two sorted
        plans, reflecting a single pass through each table.
        NOTE(review): the original notes say this excludes the one-time
        cost of materializing and sorting; that depends on what
        SortPlan.blocksAccessed() reports -- confirm there.
        """
        left_blocks = self.p1.blocksAccessed()
        right_blocks = self.p2.blocksAccessed()
        return left_blocks + right_blocks

    def recordsOutput(self):
        """Return the estimated number of records in the join.

        Assuming uniform distribution, the formula is:
        R(join(p1,p2)) = R(p1) * R(p2) / max{V(p1,F1), V(p2,F2)}
        The quotient is truncated to an int.
        """
        distinct1 = self.p1.distinctValues(self.fldname1)
        distinct2 = self.p2.distinctValues(self.fldname2)
        divisor = max(distinct1, distinct2)
        pairs = self.p1.recordsOutput() * self.p2.recordsOutput()
        return int(pairs / divisor)

    def distinctValues(self, fldname):
        """Estimate the number of distinct values of a field in the join.

        The estimate is delegated to the sorted LHS plan when its schema
        has the field, and to the sorted RHS plan otherwise.

        :param fldname: the name of the field
        """
        owner = self.p1 if self.p1.schema().hasField(fldname) else self.p2
        return owner.distinctValues(fldname)

    def schema(self):
        """Return the schema of the join (built from both input schemas)."""
        return self.sch
Exemplo n.º 5
0
class IndexJoinPlan(Plan):
    """Plan for a join that probes a right-hand index per left-hand record."""

    def __init__(self, p1, p2, ii, joinfield):
        """Implement the join operator using the specified LHS and RHS plans.

        :param p1: the left-hand plan
        :param p2: the right-hand plan
        :param ii: information about the right-hand index
        :param joinfield: the left-hand field used for joining
        """
        super(IndexJoinPlan, self).__init__()
        self.p1 = p1
        self.p2 = p2
        self.ii = ii
        self.joinfield = joinfield
        self.sch = Schema()
        for source in (p1, p2):
            self.sch.addAll(source.schema())

    def open(self):
        """Open an indexjoin scan for this query.

        Opens the left-hand plan, the right-hand plan and the index, in
        that order, and hands them to an IndexJoinScan.
        """
        left_scan = self.p1.open()
        # Original note: throws an exception if p2 is not a tableplan
        # (not verifiable from this class alone).
        table_scan = self.p2.open()
        index = self.ii.open()
        return IndexJoinScan(left_scan, index, self.joinfield, table_scan)

    def blocksAccessed(self):
        """Estimate the number of block accesses to compute the join.

        Formula: B(indexjoin(p1,p2,idx)) = B(p1) + R(p1) * B(idx)
                 + R(indexjoin(p1,p2,idx))
        """
        left_blocks = self.p1.blocksAccessed()
        # One index traversal is costed per left-hand output record.
        index_cost = self.p1.recordsOutput() * self.ii.blocksAccessed()
        return left_blocks + index_cost + self.recordsOutput()

    def recordsOutput(self):
        """Estimate the number of output records in the join.

        Formula: R(indexjoin(p1,p2,idx)) = R(p1) * R(idx)
        """
        left_records = self.p1.recordsOutput()
        index_records = self.ii.recordsOutput()
        return left_records * index_records

    def distinctValues(self, fldname):
        """Estimate the number of distinct values for the specified field.

        The estimate is delegated to p1 when p1's schema has the field,
        and to p2 otherwise.

        :param fldname: the name of the field
        """
        owner = self.p1 if self.p1.schema().hasField(fldname) else self.p2
        return owner.distinctValues(fldname)

    def schema(self):
        """Return the schema of the index join (built from both plans)."""
        return self.sch