Пример #1
0
    def _apply_index(self, request):
        """Apply this range index to a catalog query.

        Returns ``(resultset, (self.id,))`` for matching documents, or
        ``None`` when the request does not carry a usable (start, end)
        pair for this index.
        """
        record = parseIndexRequest(request, self.id)
        try:
            qstart, qend = record.keys
        except TypeError:
            # record.keys is None (index not addressed) or not a 2-sequence.
            return None

        # Clamp query bounds into the representable 64-bit integer range.
        minint = BTrees.family64.minint
        maxint = BTrees.family64.maxint

        qstart = min(maxint, max(minint, qstart))
        qend = max(minint, min(maxint, qend))

        # Documents whose (since, until) interval straddles qstart,
        # i.e. since <= qstart <= until.
        # NOTE(review): original comment said "start in inside range" but
        # the code selects intervals containing qstart — confirm intent.
        start = multiunion(self._since_index.values(max=qstart))
        end = multiunion(self._until_index.values(min=qstart))
        start_into = intersection(start, end)

        # Documents whose interval straddles qend (since <= qend <= until).
        end = multiunion(self._until_index.values(min=qend))
        start = multiunion(self._since_index.values(max=qend))
        end_into = intersection(start, end)

        # Documents with since >= qstart AND until <= qend, i.e. intervals
        # fully contained in the queried range.
        # NOTE(review): original comment claimed "start before range and
        # end after range", which contradicts the min/max arguments here.
        start = multiunion(self._since_index.values(min=qstart))
        end = multiunion(self._until_index.values(max=qend))
        start_before_end_after = intersection(start, end)

        result = union(start_into, end_into)
        result = union(result, start_before_end_after)

        # Map interval ids back to document-id sets and report the field.
        return multiunion(map(self._index.__getitem__, result)), (self.id,)
Пример #2
0
    def search(self, path, default_level=0):
        """Search the path index.

        ``path`` is either a string representing a relative URL (or a
        part of a relative URL), or a tuple ``(path, level)``.

        ``level >= 0`` starts searching at the given level;
        ``level < 0`` searches the path at every indexed level and
        returns the union of the results.

        Returns an IISet of document ids.
        """
        if isinstance(path, StringType):
            level = default_level
        else:
            level = int(path[1])
            path = path[0]

        comps = self.splitPath(path)

        if not comps:
            # Empty path: every indexed document matches.
            return IISet(self._unindex.keys())

        if level >= 0:
            results = []
            for i, comp in enumerate(comps):
                # Use 'in' instead of the deprecated has_key() so the code
                # also runs on Python 3 mappings and BTrees.
                if comp not in self._index:
                    return IISet()
                if level + i not in self._index[comp]:
                    return IISet()
                results.append(self._index[comp][level + i])

            res = results[0]
            for other in results[1:]:
                res = intersection(res, other)
            return res

        # level < 0: try the path at every level, union the results.
        results = IISet()
        for level in range(self._depth + 1):
            ids = None
            error = 0
            for cn, comp in enumerate(comps):
                try:
                    ids = intersection(ids, self._index[comp][level + cn])
                except KeyError:
                    # Component unknown at this depth: no match here.
                    error = 1
            if error == 0:
                results = union(results, ids)
        return results
    def timing(self, small, large, text=''):
        """Benchmark three intersection implementations over the same input.

        Runs LOOP iterations of the new Python, old Python and Cython
        intersections of (small, large) and prints a report only when an
        alternative's speed ratio vs. the old one leaves its expected band.
        (Python 2 code: print statements, xrange.)
        """
        new = 0.0  # accumulated wall time of intersection2
        old = 0.0  # accumulated wall time of intersection
        c = 0.0    # accumulated wall time of ciiintersection
        loop = LOOP
        for i in xrange(loop):
            start = time()
            intersection2(small, large)
            new+=(time()-start)

            start = time()
            intersection(small, large)
            old+=(time()-start)

            start = time()
            ciiintersection(small, large)
            c+=(time()-start)

        # Ratios > 1 mean the alternative is faster than the old version.
        new_ratio = old / new
        c_ratio = old / c

        # Report only outliers: a big win or a big regression.
        new_report = False
        if new_ratio <= 0.4 or new_ratio > 2:
            new_report = True
        c_report = False
        if c_ratio <= 0.8 or c_ratio > 1.2:
            c_report = True

        if c_report or new_report:
            print
            print text
            print 'Old x%s: %.6f' % (loop, old)
            print 'New x%s: %.6f - factor: %.2f' % (loop, new, new_ratio)
            print 'Cyt x%s: %.6f - factor: %.2f' % (loop, c, c_ratio)
Пример #4
0
    def _sort_iterate_index(self, actual_result_count, result, rs,
                            limit, merge, reverse,
                            sort_index, sort_index_length, sort_spec,
                            second_indexes_key_map):
        """Sort ``rs`` by iterating over the sort index itself.

        Appends ``(key, intset, self.__getitem__)`` triples to ``result``
        and returns ``(actual_result_count, length, result)``, where
        ``length`` is the total number of documents placed in buckets.
        """
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # TODO: len(sort_index) isn't actually what we want for a keyword
        # index, as it's only the unique values, not the documents.
        # Don't use this case while using limit, as we return results of
        # non-flattened intsets, and would have to merge/unflattened those
        # before limiting.
        length = 0
        try:
            # Probe whether rs supports IIBTree set operations.
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        if sort_index_length == 1:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    result.append((k, intset, self.__getitem__))
            result.sort(reverse=reverse)
        else:
            # Multiple sort indexes: build composite keys from the
            # secondary key maps and sort via multisort at the end.
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    # sort on secondary index
                    keysets = defaultdict(list)
                    for i in intset:
                        full_key = (k, )
                        for km in second_indexes_key_map:
                            try:
                                full_key += (km[i], )
                            except KeyError:
                                # Document has no value in this secondary
                                # index; omit that key component.
                                pass
                        keysets[full_key].append(i)
                    for k2, v2 in keysets.items():
                        result.append((k2, v2, self.__getitem__))
            result = multisort(result, sort_spec)

        return (actual_result_count, length, result)
    def train(self):
        """Build naive-bayes training data from the catalog's noun-term and
        Subject indexes, and (re)train ``self.classifier``.
        """
        catalog = getToolByName(self, "portal_catalog")

        # Template feature vector: every known noun term, initially 0.
        presentNouns = dict()
        for noun in catalog.uniqueValuesFor("noun_terms"):
            presentNouns.setdefault(noun, 0)

        subjectIndex = catalog._catalog.getIndex("Subject")
        nounTermsIndex = catalog._catalog.getIndex("noun_terms")

        # Internal catalog ids of objects that have noun terms,
        # and of objects that have subjects; train on the overlap.
        nounTermIndexIds = IISet(nounTermsIndex._unindex.keys())
        subjectIndexIds = IISet(subjectIndex._unindex.keys())
        commonIds = intersection(subjectIndexIds, nounTermIndexIds)

        trainingData = []
        for cid in commonIds:
            features = presentNouns.copy()
            for noun in nounTermsIndex._unindex[cid]:
                features[noun] = 1
            # One (features, label) sample per tag; the samples for one
            # object intentionally share the same feature dict.
            for tag in subjectIndex._unindex[cid]:
                trainingData.append((features, tag))

        if trainingData:
            self.classifier = NaiveBayesClassifier.train(trainingData)
Пример #6
0
 def keywords_of_section(self, section, kwfilter):
     """Vocabulary of the keywords in use at or below ``section``,
     optionally narrowed by the substring ``kwfilter``.
     """
     catalog = getToolByName(section, 'portal_catalog')._catalog
     path_index = catalog.indexes[self.path_index]
     keyword_index = catalog.indexes[self.keyword_index]

     # Low-level query: all record ids at or below the section's path.
     path_query = {
         self.path_index: {
             'query': '/'.join(section.getPhysicalPath()),
             'depth': -1,
         }
     }
     kwfilter = safe_encode(kwfilter)
     # Uses internal zcatalog specific details to quickly get the values.
     path_result, info = path_index._apply_index(path_query)

     matching = []
     for tag in keyword_index.uniqueValues():
         if kwfilter and kwfilter not in safe_encode(tag):
             continue
         tags_result, info = keyword_index._apply_index(
             {self.keyword_index: tag})
         # Keep the tag only if some object under the path carries it.
         if intersection(path_result, tags_result):
             matching.append(tag)
     # uniqueValues() iterates in sorted order, so the result is sorted.
     return safe_simplevocabulary_from_values(matching)
Пример #7
0
    def _reindex_doc(self, docid, text):
        """Reindex ``docid`` against new ``text``.

        Touches as few docid->w(docid, score) maps in ``._wordinfo`` as
        possible by diffing old and new word-id sets.  Returns the number
        of word ids produced for the new text.
        """
        prev_wids = self.get_words(docid)
        prev_weights, prev_docweight = self._get_frequencies(prev_wids)

        cur_wids = self._lexicon.sourceToWordIds(text)
        cur_weights, cur_docweight = self._get_frequencies(cur_wids)

        prev_set = IITreeSet(prev_weights.keys())
        cur_set = IITreeSet(cur_weights.keys())

        shared = intersection(prev_set, cur_set)
        removed = difference(prev_set, shared)
        added = difference(cur_set, shared)
        del prev_set, cur_set

        # Words no longer present: drop their per-document info.
        for wid in removed.keys():
            self._del_wordinfo(wid, docid)

        # Brand-new words: record their scores.
        for wid in added.keys():
            self._add_wordinfo(wid, cur_weights[wid], docid)

        # Words present before and after: rewrite only changed scores.
        # For the Okapi indexer this triggers only for words whose counts
        # changed.  For the cosine indexer it may trigger for every wid,
        # since W(d) probably changed and W(d) is divided into every score.
        for wid in shared.keys():
            score = cur_weights[wid]
            if prev_weights[wid] != score:
                self._add_wordinfo(wid, score, docid)

        self._docweight[docid] = cur_docweight
        self._docwords[docid] = WidCode.encode(cur_wids)
        return len(cur_wids)
    def timing(self, small, large):
        """Benchmark the new, old and (optional) Cython intersection
        implementations over the same (small, large) input and print the
        accumulated runtimes.  (Python 2 code: print statements, xrange.)
        """
        new = 0.0   # total time of intersection2
        old = 0.0   # total time of intersection
        new2 = 0.0  # accumulator declared but never used below
        c = 0.0     # total time of ciiintersection (if available)
        loop = LOOP
        for i in xrange(loop):

            start = time()
            res = intersection2(small, large)
            new+=(time()-start)

            start = time()
            res = intersection(small, large)
            old+=(time()-start)

            # The Cython variant is optional at import time.
            if ciiintersection is not None:
                start = time()
                res = ciiintersection(small, large)
                c+=(time()-start)

        print 'Old x%s: %.6f' % (loop, old)
        print 'New x%s: %.6f' % (loop, new)
        if ciiintersection is not None:
            print 'Cyt x%s: %.6f' % (loop, c)
Пример #9
0
    def below(self, arg):
        """Find all resources at or below path, within the limits given.
        """

        # Parse and validate.
        # ===================

        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            # Unknown path: nothing to return (implicitly yields None).
            return


        # Build
        # =====
        # Intersect the rid sets of each (level, path-part) pair.  BTrees'
        # intersection treats a None first argument as "everything", so
        # the first iteration seeds ``rids``.

        parts = path.split(os.sep)
        rids = None
        for level in range(len(parts)):
            rids = intersection(rids, self.parts[(level, parts[level])])
        if rids is None:
            return IISet() # short-cut


        # Limits
        # ======
        # Remove rids that are above any upper limit, and then only include rids
        # that are above any lower limit. Limits are relative to the level of
        # the requested path.
        # NOTE(review): ``level`` below is the last value of the loop
        # variable above, i.e. the depth of the path's final component.

        if upper is not None:
            upper += level
            for i in range(level, upper):
                if i not in self.levels:
                    # No resources recorded at this depth or deeper.
                    break
                rids = difference(rids, self.levels[i])
        if lower is not None:
            lower += level
            _rids = []
            for i in range(level, lower):
                if i not in self.levels:
                    break
                _rids.append(self.levels[i])
            rids = intersection(rids, multiunion(_rids))

        return rids
Пример #10
0
    def _apply_index(self, request, cid=''):
        """Apply the index to query parameters given in 'request', which
        should be a mapping object.

        If the request does not contain the needed parameters, return
        None.

        If the request contains a parameter with the name of the column
        + "_usage", sniff for information on how to handle applying the
        index.

        Otherwise return two objects: a ResultSet containing the record
        numbers of the matching records, and a tuple with the names of
        all data fields used.
        """
        record = parseIndexRequest(request, self.getId())
        if record.keys is None:
            return None

        term = self._convertDateTime(record.keys[0])

        # Aggregate the sets of each bucket separately via multiunion,
        # to avoid large-small union penalties.
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))
        since = multiunion(self._since.values(None, term))

        # Records bounded on both sides and containing the term.
        bounded = intersection(until, since)

        # Merge from smallest to largest set.
        result = union(bounded, until_only)
        result = union(result, since_only)
        result = union(result, self._always)

        return result, (self._since_field, self._until_field)
Пример #11
0
def intersectionResultSets(sets):
    """Intersect a sequence of ResultSets into one ResultSet.

    Docids are intersected pairwise; word lists are concatenated.
    An empty input yields an empty ResultSet.
    """
    if not sets:
        return ResultSet(DocidList(), WordList())

    first = sets[0]
    docids = first.getDocids()
    words = WordList(first.getWords())

    # Avoid shadowing the builtin ``set`` with the loop variable.
    for result_set in sets[1:]:
        docids = intersection(docids, result_set.docids)
        words.extend(result_set.words)
    return ResultSet(docids, words)
Пример #12
0
 def _eval(self,context):
   """Evaluate this 'And' query against ``context``.

   Returns a set (IISet or similar) of matching object ids.
   """
   # Classify subqueries into empty/lookup/complex/indexed/notQ buckets.
   csq = self._classifySubqueries()
   if csq['empty']: return IISet() # empty result
   nsq = csq['lookup'] + csq['complex'] + csq['indexed']
   notsq = csq['notQ']
   if not nsq and not notsq:
     # an empty 'And' query
     return context._getObjectIds()
   # Only negative subqueries: promote one of them to be evaluated as
   # the base set; the remaining negations are subtracted below.
   # NOTE(review): relies on the notQ's own _eval producing the
   # complement set — confirm against the notQ implementation.
   if not nsq: nsq.append(notsq.pop())
   # Intersect all positive results (intersection(None, x) seeds r),
   # then subtract each remaining negative subquery's inner result.
   r = None
   for q in nsq: r = intersection(r, q._eval(context))
   for q in notsq: r = difference(r, q._query._eval(context))
   return r
Пример #13
0
 def count(self, context, facet, intersect=None):
     """Return the number of records matching this filter for ``facet``,
     optionally intersected with ``intersect`` (an id set or
     IQueryResults), using and maintaining the set cache.
     """
     if IQueryResults.providedBy(intersect):
         intersect = IISet(intersect.keys())
     sm = sitemanager_for(context)
     unique_name = '%s.%s' % (facet.name, self.name)
     cache_tools = queryUtility(ISetCacheTools, context=sm)
     invalidated = cache_tools.invalidated_records
     if not isinstance(invalidated, IISet):
         invalidated = IISet(invalidated)
     # BUGFIX: ``invalid`` was previously assigned only when ``intersect``
     # is an IISet, causing a NameError on the cache-hit path otherwise.
     invalid = False
     if isinstance(intersect, IISet):
         invalid = len(intersection(intersect, invalidated)) > 0
     if unique_name in cache_tools.filter_setid_cache:
         setid = cache_tools.filter_setid_cache[unique_name]
         if setid in cache_tools.set_cache:
             if invalid:
                 # Stale cache entry: drop it and fall through to requery.
                 del(cache_tools.set_cache[setid])
                 del(cache_tools.filter_setid_cache[unique_name])
             else:
                 records = cache_tools.set_cache[setid]
                 if intersect is None:
                     return len(records)
                 if isinstance(intersect, IISet):
                     # optimal to cast smaller set to match IISet.
                     return len(intersection(intersect, IISet(records)))
                 return len(set(intersect) & records)
     # otherwise, at this point, no cached value, so query catalog...
     qf = self(unique_name)
     runner = AdvancedQueryRunner(context)
     result = runner(qf)
     setid = result.setid
     cache_tools.set_cache[setid] = result.frozen
     cache_tools.filter_setid_cache[unique_name] = setid
     if intersect is None:
         return len(result)
     if isinstance(intersect, IISet):
         return len(intersection(intersect, IISet(result.frozen)))
     return len(set(intersect) & result.frozen)
Пример #14
0
    def query_index(self, record, resultset=None):
        """Query this boolean index.

        Only the first key in ``record.keys`` is considered: when its
        truth value matches the indexed value, the forward index is
        intersected with ``resultset``; otherwise the forward index is
        subtracted from ``resultset`` (or from everything indexed).
        Returns an empty IISet when there are no keys.
        """
        index = self._index
        indexed = self._index_value

        for key in record.keys:
            if bool(key) is not bool(indexed):
                # Asked for the non-indexed value: remove the forward
                # index from the resultset or from the full unindex.
                if resultset is not None:
                    return difference(resultset, index)
                return union(difference(self._unindex, index), IISet([]))
            # Asked for the indexed value: check the forward index.
            return intersection(index, resultset)
        return IISet()
    def test_depth_limit_resultset(self):
        """Depth-limited root queries, intersected with a resultset."""
        self._populateIndex()
        resultset = IISet([1, 2, 3, 4, 8, 16])
        expectations = [
            # (depth, expected document ids)
            (1, [1, 8, 16]),
            (2, [1, 2, 8, 16]),
            (3, [1, 2, 3, 8, 16]),
        ]

        for depth, expected in expectations:
            query = dict(path=dict(query='/', depth=depth))
            res = self._index._apply_index(query, resultset=resultset)
            combined = list(intersection(res[0], resultset))
            self.assertEqual(combined, expected)
Пример #16
0
    def _search(self, path, default_level=0):
        """ Perform the actual search.

        ``path``
            a string representing a relative URL, or a part of a relative URL,
            or a tuple ``(path, level)``.  In the first two cases, use
            ``default_level`` as the level for the search.

        ``default_level``
            the level to use for non-tuple queries.

        ``level >= 0`` =>  match ``path`` only at the given level.

        ``level <  0`` =>  match ``path`` at *any* level
        """
        if isinstance(path, str):
            level = default_level
        else:
            level = int(path[1])
            path = path[0]

        if level < 0:
            # Search at every level, return the union of all results
            return multiunion(
                [self._search(path, level)
                 for level in range(self._depth + 1)])

        # BUGFIX: materialize the filter; on Python 3 a bare filter()
        # object has no len() and would raise TypeError below.
        comps = list(filter(None, path.split('/')))

        if level + len(comps) - 1 > self._depth:
            # Our search is for a path longer than anything in the index
            return IISet()

        if not comps:
            # Empty path: every indexed document matches.
            return IISet(self._unindex.keys())

        # Intersect per-component sets, starting at the deepest level so
        # an unknown component short-circuits with an empty set.
        results = None
        for i, comp in reversed(list(enumerate(comps))):
            tree = self._index.get(comp, None)
            if tree is None:
                return IISet()
            tree2 = tree.get(level + i, None)
            if tree2 is None:
                return IISet()
            results = intersection(results, tree2)
        return results
Пример #17
0
    def _apply_index(self, request, resultset=None):
        """Apply this boolean index to ``request``.

        Returns ``(resultset, (self.id,))``, or None when the request does
        not address this index.  Only the first requested key is honoured.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        indexed = self._index_value

        for key in record.keys:
            if bool(key) is not bool(indexed):
                # Asked for the non-indexed value: subtract the forward
                # index from the resultset or from the full unindex.
                if resultset is not None:
                    return (difference(resultset, index), (self.id,))
                return (
                    union(difference(self._unindex, index), IISet([])),
                    (self.id,),
                )
            # Asked for the indexed value: use the forward index.
            return (intersection(index, resultset), (self.id,))
        return (IISet(), (self.id,))
Пример #18
0
    def _search_index(self, cr, index_id, query, rs):
        """Apply one catalog index to ``query`` and intersect its result
        with the running result set ``rs``.

        ``cr`` is the catalog report used for per-index timing.  Returns
        the narrowed rs, or None when the index produced an empty result.
        """
        cr.start_split(index_id)

        index_rs = None
        index = self.getIndex(index_id)
        limit_result = ILimitedResultIndex.providedBy(index)

        if IQueryIndex.providedBy(index):
            # Modern query API: build an IndexQuery and call query_index.
            index_query = IndexQuery(query, index.id, index.query_options,
                                     index.operators, index.useOperator)
            if index_query.keys is not None:
                index_rs = index.query_index(index_query, rs)
        else:
            # Legacy API: limit-aware indexes receive the current rs.
            if limit_result:
                index_result = index._apply_index(query, rs)
            else:
                index_result = index._apply_index(query)

            # Parse (resultset, used_attributes) index return value.
            if index_result:
                index_rs, _ = index_result

        if not index_rs:
            # Short circuit if empty index result.
            rs = None
        else:
            # Provide detailed info about the pure intersection time.
            intersect_id = index_id + '#intersection'
            cr.start_split(intersect_id)
            # weightedIntersection preserves the values from any mappings
            # we get, as some indexes don't return simple sets.
            if hasattr(rs, 'items') or hasattr(index_rs, 'items'):
                _, rs = weightedIntersection(rs, index_rs)
            else:
                rs = intersection(rs, index_rs)

            cr.stop_split(intersect_id)

        # Consider the time it takes to intersect the index result
        # with the total result set to be part of the index time.
        cr.stop_split(index_id, result=index_rs, limit=limit_result)

        return rs
Пример #19
0
    def _apply_index(self, request, resultset=None):
        """Apply the relation index to ``request``.

        Returns ``(resultset, used_index_names)``, or None when no known
        relation type appears in the request.
        """
        setlist = []
        indices_used = []
        for reltype in self.getIndexSourceNames():
            query = request.get(reltype)
            if query is None:
                continue

            # Accept either a UID string or an object adaptable to IUUID.
            if isinstance(query, str):
                target = query
            else:
                target = IUUID(query)

            indices_used.append(reltype)
            matches = self._index[reltype].get(target)
            if matches is not None:
                setlist.append(matches)

        if not indices_used:
            return

        if len(setlist) == 1:
            return setlist[0], tuple(indices_used)

        # If we already get a small result set passed in, intersecting
        # the various indexes with it and doing the union later is
        # faster than creating a multiunion first.
        if resultset is not None and len(resultset) < 200:
            r = multiunion([intersection(resultset, s) for s in setlist])
        else:
            r = multiunion(setlist)

        if r is None:
            r = IISet()
        return r, tuple(indices_used)
    def query_index(self, record, resultset=None):
        """Query this date-range index for the single date in ``record``.

        Without a resultset, the direct match is computed; with one, the
        *inverse* match is computed and subtracted.  Either flavour may be
        served from / stored into the request cache (what is cached is the
        direct set resp. the inverse set, matching the lookup below).
        """
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record, resultset)
            cached = cache.get(cachekey, None)
            if cached is not None:
                if resultset is None:
                    return cached
                else:
                    # Cached value holds the inverse match for this key.
                    return difference(resultset, cached)

        term = self._convertDateTime(record.keys[0])
        if resultset is None:
            # Aggregate sets for each bucket separately, to avoid
            # large-small union penalties.
            until_only = multiunion(self._until_only.values(term))
            since_only = multiunion(self._since_only.values(None, term))
            until = multiunion(self._until.values(term))
            since = multiunion(self._since.values(None, term))
            bounded = intersection(until, since)

            # Merge from smallest to largest.
            result = multiunion([bounded, until_only, since_only,
                                 self._always])
            if cache is not None:
                cache[cachekey] = result

            return result
        else:
            # Compute the inverse and subtract from res
            until_only = multiunion(self._until_only.values(None, term - 1))
            since_only = multiunion(self._since_only.values(term + 1))
            until = multiunion(self._until.values(None, term - 1))
            since = multiunion(self._since.values(term + 1))

            result = multiunion([since, since_only, until_only, until])
            if cache is not None:
                cache[cachekey] = result

            return difference(resultset, result)
Пример #21
0
    def _apply_index(self, request, cid=''):
        """Apply this geo-backed index: resolve the spatial query via the
        configured geo index, filter by the current user's permissions,
        and include each match's parent container in the result.

        Returns ``(IISet_of_rids, (self.getId(),))`` or None when the
        request does not address this index.
        """
        record = parseIndexRequest(request, self.getId(), self.query_options)
        if record.keys is None:
            return None

        catalog = getToolByName(self, 'portal_catalog')

        # Delegate the spatial part of the query to the geo index.
        geoIndex = catalog._catalog.getIndex(self.geoindex_id)
        geoRequest = {}
        geoRequest[self.geoindex_id] = {
            'query': record.keys, 'range': record.range}
        geo_response = geoIndex._apply_index(geoRequest, raw=True)

        # Map catalog rid -> physical path of each spatial hit.
        paths = {}
        for item in geo_response:
            paths[int(item['id'])] = item['properties']['path']

        # Restrict hits to records the current user is allowed to see.
        rolesIndex = catalog._catalog.getIndex('allowedRolesAndUsers')
        user = _getAuthenticatedUser(self)
        perms_set = rolesIndex._apply_index(
            {'allowedRolesAndUsers': catalog._listAllowedRolesAndUsers(user)}
            )[0]

        r = intersection(perms_set, IISet(paths.keys()))

        if isinstance(r, int):
            # Normalize a bare rid into a one-element set.
            r = IISet((r,))
        if r is None:
            return IISet(), (self.getId(),)

        else:
            # Union in the parent folder of every matching object.
            url_tool = getToolByName(self, 'portal_url')
            portal_path = url_tool.getPortalObject().getPhysicalPath()
            root = list(portal_path)
            def up(path):
                # Portal-rooted path of ``path``'s parent.
                return '/'.join(root + path.strip('/').split('/')[:-1])
            return union(
                r,
                IISet([catalog.getrid(up(paths[lid])) for lid in r])
                ), (self.getId(),)
Пример #22
0
def keywords_filtered_by_context(context, index_name='Subject'):
    """Return the keywords actually used at or below ``context``.

    Keywords come from the ``index_name`` index and are kept only when at
    least one object under the context's path carries them.
    """
    catalog = api.portal.get_tool('portal_catalog')._catalog
    path_index = catalog.indexes['path']
    keyword_index = catalog.indexes[index_name]

    # Low-level query: all record ids at or below the context's path.
    path_query = {
        'path': {
            'query': '/'.join(context.getPhysicalPath()),
            'depth': -1,
        }
    }
    path_result, info = path_index._apply_index(path_query)

    filtered = []
    for tag in keyword_index.uniqueValues():
        tags_result, info = keyword_index._apply_index({index_name: tag})
        if intersection(path_result, tags_result):
            filtered.append(tag)
    return filtered
def group(self, seq):
  """Group the document ids in ``seq`` by their sort key.

  Yields ``(key, ids)`` pairs in index order (reversed when
  ``self._sortReverse`` is set); documents without a sort value are
  yielded last as ``(None, ids)``.  Two strategies are used depending
  on how large ``seq`` is relative to the sort index.
  """
  sortIndex = self._sortIndex;
  sortReverse = self._sortReverse
  ns = len(seq); ni = len(sortIndex)

  if ns >= 0.1 * ni:
    # result large compared to index -- sort via index
    handled = IISet();
    hn = 0
    # an "optimized" index exposes _load to fetch the doc-id set of
    # one index entry lazily; fall back to materialized items otherwise
    _load = getattr(sortIndex, '_load', None)
    if _load is None:
      # not an optimized index
      items = sortIndex.items()
      _load = lambda (x1, x2): x2
      if sortReverse:
          items.reverse()
    elif sortReverse:
      # prefer the index's native reverse-order support when available
      gRO = getattr(sortIndex, 'getReverseOrder', None)
      items = gRO and gRO()
      if items is None:
        items = list(sortIndex._index.keys());
        items.reverse()
    else:
        items = sortIndex._index.keys()

    for i in items:
      # restrict each index bucket to the documents actually in seq
      ids = intersection(seq, _load(i))
      if ids:
        handled.update(ids);
        hn += len(ids)
        yield i, ids
    # documents not covered by any index bucket have no sort value
    if hn != len(seq):
        yield None, difference(seq, handled)
  else:
    # result relatively small -- sort via result
    m = OOBTree()
    keyFor = getattr(sortIndex, 'keyForDocument', None)
    # work around "nogopip" bug: it defines "keyForDocument" as an integer
    if not callable(keyFor):
      # this will fail, when the index neither defines a reasonable
      # "keyForDocument" nor "documentToKeyMap". In this case,
      # the index cannot be used for sorting.
      keyFor = lambda doc, map=sortIndex.documentToKeyMap(): map[doc]
    noValue = IITreeSet()

    for doc in seq.keys():
      try: k = keyFor(doc)
      except KeyError: noValue.insert(doc); continue

      # wrap the key so heterogeneous key types still compare consistently
      k = NaturalObjectCompare( k)
      l = m.get(k)
      if l is None: l = m[k] = IITreeSet()
      l.insert(doc)
    items = m.items()
    if sortReverse:
        items = list(items);
        items.reverse()

    for i in items:
        yield i
    if noValue: yield None, noValue
Пример #24
0
    def search(self,
            query, sort_index=None, reverse=False, limit=None, merge=True):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        # The catalog plan records index timings and suggests an ordering
        # of indexes for this query shape.
        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        rs = None  # result set
        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            # indexes supporting ILimitedResultIndex can narrow their work
            # to the intermediate result set accumulated so far
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 4
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result
                # with the total result set to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            # no batching given: treat the explicit sort limit as batch size
            b_size = limit

        # Build a report key like 'sort_on#<index>#asc#limit-<n>' so the
        # catalog plan can account for sorting time per configuration.
        if sort_index is None:
            sort_report_name = None
        else:
            if isinstance(sort_index, list):
                sort_name = '-'.join(i.getId() for i in sort_index)
            else:
                sort_name = sort_index.getId()
            if isinstance(reverse, list):
                reverse_name = '-'.join(
                    'desc' if r else 'asc' for r in reverse)
            else:
                reverse_name = 'desc' if reverse else 'asc'
            sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name
            if limit is not None:
                sort_report_name += '#limit-%s' % limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 4 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                    b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                    actual_result_count=rlen)
            else:
                cr.start_split(sort_report_name)
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                        actual_result_count=rlen, b_start=b_start,
                        b_size=b_size)
                cr.stop_split(sort_report_name, None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                            for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on#score')

                    # sort it by score
                    rs = rs.byValue(0)
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        data = self.data[key]
                        klass = self._v_result_class
                        schema_len = len(klass.__record_schema__)
                        norm_score = int(100.0 * score / max)
                        if schema_len == len(data) + 3:
                            # the brain schema has slots for id/score/norm
                            r = klass(tuple(data) + (key, score, norm_score))
                        else:
                            r = klass(data)
                            r.data_record_id_ = key
                            r.data_record_score_ = score
                            r.data_record_normalized_score_ = norm_score
                        r = r.__of__(aq_parent(self))
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                        b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                        actual_result_count=rlen)
                    cr.stop_split('sort_on#score', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                    b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                    actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split(sort_report_name)
                result = self.sortResults(rs, sort_index, reverse, limit,
                    merge, actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split(sort_report_name, None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result
def unindex_apply_index(self, request, cid='', type=type, res=None):
    """Search this UnIndex-style field index for the query in ``request``.

    Returns ``(result_set, (self.id,))``, or ``None`` when the request
    holds nothing addressed to this index.  ``res`` is an optional
    already-narrowed result set; intersections are bounded by it.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys==None: return None

    index = self._index
    r     = None
    opr   = None

    # experimental code for specifing the operator
    operator = record.get('operator',self.useOperator)
    if not operator in self.operators :
        raise RuntimeError,"operator not valid: %s" % escape(operator)

    # depending on the operator we use intersection or union
    if operator=="or":  set_func = union
    else:               set_func = intersection

    # Range parameter
    range_parm = record.get('range',None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min")>-1:
            opr_args.append("min")
        if range_parm.find("max")>-1:
            opr_args.append("max")

    if record.get('usage',None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args=opr[0], opr[1:]

    if opr=="range":   # range search
        # lo/hi bounds come from the extremes of the supplied keys
        if 'min' in opr_args: lo = min(record.keys)
        else: lo = None
        if 'max' in opr_args: hi = max(record.keys)
        else: hi = None
        if hi:
            setlist = index.values(lo,hi)
        else:
            setlist = index.values(lo)


        # If we only use 1 key (default setting), intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            return result, (self.id,)

        if operator == 'or':
            r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp
            # start from the incoming result set so it bounds the work
            r = res
            for s in setlist:
                r = intersection(r, s)

    else: # not a range search
        # Filter duplicates, and sort by length
        keys = set(record.keys)
        setlist = []
        for k in keys:
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can't possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                return IISet(), (self.id,)
            elif isinstance(s, int):
                s = IISet((s,))
            setlist.append(s)

        # If we only use 1 key (default setting), intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            return result, (self.id,)

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is faster
            # than creating a multiunion first.
            if res is not None and len(res) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(res, s))
                r = multiunion(smalllist)
            else:
                r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)
            r = res
            for s in setlist:
                r = intersection(r, s)

    # normalize: a bare int rid becomes a one-element set, None -> empty
    if isinstance(r, int):  r=IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
Пример #26
0
    def query_index(self, record, resultset=None):
        """Search the index with the given IndexQuery object.

        If not `None`, the resultset argument
        indicates that the search result is relevant only on this set,
        i.e. everything outside resultset is of no importance.
        The index can use this information for optimizations.
        """
        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)

        operator = record.operator

        # Per-request cache: 'or' results are cached directly; 'and'
        # results depend on resultset, so only the per-key set list is
        # cached and re-intersected here.
        cachekey = None
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record)
            if cachekey is not None:
                cached = None
                if operator == 'or':
                    cached = cache.get(cachekey, None)
                else:
                    cached_setlist = cache.get(cachekey, None)
                    if cached_setlist is not None:
                        r = resultset
                        for s in cached_setlist:
                            # the result is bound by the resultset
                            r = intersection(r, s)
                            # If intersection, we can't possibly get a
                            # smaller result
                            if not r:
                                break
                        cached = r

                if cached is not None:
                    if isinstance(cached, int):
                        cached = IISet((cached, ))

                    if not_parm:
                        not_parm = list(map(self._convert, not_parm))
                        exclude = self._apply_not(not_parm, resultset)
                        cached = difference(cached, exclude)

                    return cached

        if not record.keys and not_parm:
            # convert into indexed format
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # convert query arguments into indexed format
            record.keys = list(map(self._convert, record.keys))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = 'range'
            opr_args = []
            if range_parm.find('min') > -1:
                opr_args.append('min')
            if range_parm.find('max') > -1:
                opr_args.append('max')

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == 'range':  # range search
            # lo/hi bounds come from the extremes of the supplied keys
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            if hi:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s,))
                    tmp.append(s)
                r = multiunion(tmp)

                if cachekey is not None:
                    cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s,))
                    tmp.append(s)
                if len(tmp) > 2:
                    setlist = sorted(tmp, key=len)
                else:
                    setlist = tmp

                # 'r' is not invariant of resultset. Thus, we
                # have to remember 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    # the result is bound by the resultset
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        else:  # not a range search
            # Filter duplicates
            setlist = []
            for k in record.keys:
                if k is None:
                    # Prevent None from being looked up. None doesn't
                    # have a valid ordering definition compared to any
                    # other object. BTrees 4.0+ will throw a TypeError
                    # "object has default comparison".
                    continue
                try:
                    s = index.get(k, None)
                except TypeError:
                    # key is not valid for this Btree so the value is None
                    LOG.error(
                        '%(context)s: query_index tried '
                        'to look up key %(key)r from index %(index)r '
                        'but key was of the wrong type.', dict(
                            context=self.__class__.__name__,
                            key=k,
                            index=self.id,
                        )
                    )
                    s = None
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # If union, we can possibly get a bigger result
                        continue
                    # If intersection, we can't possibly get a smaller result
                    if cachekey is not None:
                        # If operator is 'and', we have to cache a list of
                        # IISet objects
                        cache[cachekey] = [IISet()]
                    return IISet()
                elif isinstance(s, int):
                    s = IISet((s,))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.

                if resultset is not None and len(resultset) < 200:
                    smalllist = []
                    for s in setlist:
                        smalllist.append(intersection(resultset, s))
                    r = multiunion(smalllist)

                    # 'r' is not invariant of resultset.  Thus, we
                    # have to remember the union of 'setlist'. But
                    # this is maybe a performance killer. So we do not cache.
                    # if cachekey is not None:
                    #    cache[cachekey] = multiunion(setlist)

                else:
                    r = multiunion(setlist)
                    if cachekey is not None:
                        cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        # normalize: a bare int rid becomes a one-element set, None -> empty
        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet()
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r
Пример #27
0
    def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1,
                    actual_result_count=None, b_start=0, b_size=None):
        """Sort the result set ``rs`` using ``sort_index``.

        Returns a lazy result set in sorted order when ``merge`` is
        true, otherwise a list of ``(sortkey, rid, getter)`` tuples for
        later merging across catalogs.  Three strategies are used,
        chosen by the size of ``rs`` relative to the index and by
        ``limit``: iterate the sort index, iterate the result set, or
        an N-best partial sort.
        """
        # Sort a result set using a sort index. Return a lazy
        # result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples
        #
        # The two 'for' loops in here contribute a significant
        # proportion of the time to perform an indexed search.
        # Try to avoid all non-local attribute lookup inside
        # those loops.
        _intersection = intersection
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        result = []
        append = result.append
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the resultset, reverse sorting
        # order and limit it, then reverse the resultset again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # This is rarely exercised in practice...

            length = 0

            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = _intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    append((k, intset, _self__getitem__))
                    # Note that sort keys are unique.

            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            sequence, slen = self._limit_sequence(result, length, b_start,
                b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    append((key, did, _self__getitem__))
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
            if merge:
                if reverse:
                    result.sort(reverse=True)
                else:
                    result.sort()
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                return sequence
        elif reverse:
            # Limit/sort results using N-Best algorithm
            # This is faster for large sets then a full sort
            # And uses far less memory
            keys = []
            n = 0
            worst = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    # keep only the `limit` largest keys seen so far;
                    # `keys` stays sorted ascending, `worst` is its minimum
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                return sequence
        elif not reverse:
            # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            n = 0
            best = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    # keep only the `limit` smallest keys seen so far;
                    # `keys` stays sorted ascending, `best` is its maximum
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                return sequence

        return LazyMap(self.__getitem__, result, len(result),
            actual_result_count=actual_result_count)
def dateindex_apply_index(self, request, cid="", type=type, res=None):
    """Apply a DateIndex to the query parameters in *request*.

    Returns ``(resultset, (self.id,))`` with the matching document ids,
    or ``None`` when the request contains no query for this index.

    ``res`` is an optional incoming result set used to bound intersection
    work; ``cid``/``type`` are unused legacy parameters kept for interface
    compatibility.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    # Identity check for the "no query for this index" sentinel.
    if record.keys is None:
        return None

    # Materialize the converted keys: min() and max() below may both
    # consume them, which would silently break with a one-shot map()
    # iterator on Python 3.
    keys = [self._convert(k) for k in record.keys]

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get("operator", self.useOperator)
    if operator not in self.operators:
        # Call-style raise: consistent with the other index code and
        # valid on Python 3 (the old "raise E, msg" form is not).
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get("range", None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get("usage", None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(":")
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if "min" in opr_args:
            lo = min(keys)
        else:
            lo = None

        if "max" in opr_args:
            hi = max(keys)
        else:
            hi = None

        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # multiunion flattens the per-key sets far faster than a loop
        # of pairwise set_func unions would.
        r = multiunion(setlist)

    else:  # not a range search
        for key in keys:
            set = index.get(key, None)
            if set is not None:
                if isinstance(set, int):
                    # Single-document entries are stored as a bare int.
                    set = IISet((set,))
                else:
                    # set can't be bigger than res
                    set = intersection(set, res)
                r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r,))

    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
Пример #29
0
    def sortResults(self,
                    rs,
                    sort_index,
                    reverse=0,
                    limit=None,
                    merge=1,
                    actual_result_count=None,
                    b_start=0,
                    b_size=None):
        """Sort the result set ``rs`` by the keys held in ``sort_index``.

        Returns a lazy result set in sorted order when ``merge`` is true;
        otherwise returns a plain list of ``(sortkey, uid, getter_function)``
        tuples for later intermingling with results from other catalogs.

        ``limit``/``b_start``/``b_size`` let the caller restrict sorting
        work to the batch actually displayed; ``actual_result_count``
        preserves the full (pre-truncation) hit count for reporting.
        """
        # Sort a result set using a sort index. Return a lazy
        # result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples
        #
        # The two 'for' loops in here contribute a significant
        # proportion of the time to perform an indexed search.
        # Try to avoid all non-local attribute lookup inside
        # those loops.
        _intersection = intersection
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        _None = None
        _keyerror = KeyError
        result = []
        append = result.append
        # Normalize BTree-family result sets to their key sequence.
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the resultset, reverse sorting
        # order and limit it, then reverse the resultset again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        if merge and limit is None and (rlen >
                                        (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # This is rarely exercised in practice...

            length = 0

            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = _intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', _None)
                    if keys is not _None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    append((k, intset, _self__getitem__))
                    # Note that sort keys are unique.

            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            sequence, slen = self._limit_sequence(result, length, b_start,
                                                  b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    append((key, did, _self__getitem__))
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
            if merge:
                if reverse:
                    result.sort(reverse=True)
                else:
                    result.sort()
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif reverse:
            # Limit/sort results using N-Best algorithm
            # This is faster for large sets then a full sort
            # And uses far less memory
            keys = []
            n = 0
            worst = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    # Keep `keys` sorted ascending; mirror the insert into
                    # `result` so positions stay aligned.
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        # Window is full: evict the smallest entry so the
                        # window size stays constant at `limit`.
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif not reverse:
            # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            n = 0
            best = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        # Window is full: evict the largest entry.
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence

        return LazyMap(self.__getitem__,
                       result,
                       len(result),
                       actual_result_count=actual_result_count)
Пример #30
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the argument.

        Normalize the 'query' arguments into integer values at minute
        precision before querying.

        Returns ``(result, (self.id,))`` with the matching document ids,
        or ``None`` when the request contains no query for this index.
        ``resultset`` optionally bounds intersection work.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        # Materialize the converted keys: min() and max() below may both
        # consume them, which would break with a one-shot map() iterator
        # on Python 3.
        keys = [self._convert(k) for k in record.keys]

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % operator)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        # range parameter
        range_arg = record.get('range', None)
        if range_arg:
            opr = "range"
            opr_args = []
            if range_arg.find("min") > -1:
                opr_args.append("min")
            if range_arg.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":   # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None

            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            if hi:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # multiunion flattens the per-key sets far faster than a loop
            # of pairwise set_func unions would.
            r = multiunion(setlist)

        else:  # not a range search
            for key in keys:
                set = index.get(key, None)
                if set is not None:
                    if isinstance(set, int):
                        # Single-document entries are stored as a bare int.
                        set = IISet((set,))
                    else:
                        # set can't be bigger than resultset
                        set = intersection(set, resultset)
                    r = set_func(r, set)

        if isinstance(r, int):
            r = IISet((r,))

        if r is None:
            return IISet(), (self.id,)
        else:
            return r, (self.id,)
Пример #31
0
    def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjuntion with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        rs = None  # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        # The catalog plan records per-index timings and proposes an
        # ordering that evaluates the most selective indexes first.
        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                # Index does not participate in searching at all.
                continue

            cr.start_split(i)
            # Indexes providing ILimitedResultIndex accept the running
            # result set so they can bound their own work.
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 2.14
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result with
                # the total resultset to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 2.14 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning,
                          stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                                                      b_start, b_size)
                result = LazyMap(self.instantiate,
                                 sequence,
                                 slen,
                                 actual_result_count=rlen)
            else:
                cr.start_split('sort_on')
                result = self.sortResults(self.data,
                                          sort_index,
                                          reverse,
                                          limit,
                                          merge,
                                          actual_result_count=rlen,
                                          b_start=b_start,
                                          b_size=b_size)
                cr.stop_split('sort_on', None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                              for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on')

                    rs = rs.byValue(0)  # sort it by score
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        r=self._v_result_class(self.data[key])\
                              .__of__(aq_parent(self))
                        r.data_record_id_ = key
                        r.data_record_score_ = score
                        r.data_record_normalized_score_ = int(100. * score /
                                                              max)
                        return r

                    sequence, slen = self._limit_sequence(
                        rs, rlen, b_start, b_size)
                    result = LazyMap(getScoredResult,
                                     sequence,
                                     slen,
                                     actual_result_count=rlen)
                    cr.stop_split('sort_on', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                      b_size)
                result = LazyMap(self.__getitem__,
                                 sequence,
                                 slen,
                                 actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split('sort_on')
                result = self.sortResults(rs,
                                          sort_index,
                                          reverse,
                                          limit,
                                          merge,
                                          actual_result_count=rlen,
                                          b_start=b_start,
                                          b_size=b_size)
                cr.stop_split('sort_on', None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result
Пример #32
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the argument.

        Normalize the 'query' arguments into integer values at minute
        precision before querying.

        Returns ``(result, (self.id,))`` with the matching document ids,
        or ``None`` when the request contains no query for this index.
        ``resultset`` optionally bounds intersection work.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        # Materialize the converted keys: min() and max() below may both
        # consume them, which would break with a one-shot map() iterator
        # on Python 3.
        keys = [self._convert(k) for k in record.keys]

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % operator)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        # range parameter
        range_arg = record.get('range', None)
        if range_arg:
            opr = "range"
            opr_args = []
            if range_arg.find("min") > -1:
                opr_args.append("min")
            if range_arg.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None

            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            if hi:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # multiunion flattens the per-key sets far faster than a loop
            # of pairwise set_func unions would.
            r = multiunion(setlist)

        else:  # not a range search
            for key in keys:
                set = index.get(key, None)
                if set is not None:
                    if isinstance(set, int):
                        # Single-document entries are stored as a bare int.
                        set = IISet((set, ))
                    else:
                        # set can't be bigger than resultset
                        set = intersection(set, resultset)
                    r = set_func(r, set)

        if isinstance(r, int):
            r = IISet((r, ))

        if r is None:
            return IISet(), (self.id, )
        else:
            return r, (self.id, )
Пример #33
0
    def _apply_index(self, request, resultset=None):
        """
            Apply the index to query parameters given in 'request', which
            should be a mapping object.

            If the request does not contain the needed parameters, then
            return None.

            Otherwise return two objects.  The first object is a ResultSet
            containing the record numbers of the matching records.  The
            second object is a tuple containing the names of all data fields
            used.
        """
        iid = self.id
        record = parseIndexRequest(request, iid, self.query_options)
        if record.keys is None:
            return None

        term = self._convertDateTime(record.keys[0])
        REQUEST = aq_get(self, 'REQUEST', None)
        if REQUEST is not None:
            # Cache the computed set on the request so repeated queries for
            # (nearly) the same moment within one request are free.
            catalog = aq_parent(aq_parent(aq_inner(self)))
            if catalog is not None:
                key = self._cache_key(catalog)
                cache = REQUEST.get(key, None)
                # tid buckets terms at a coarser granularity so close-by
                # timestamps share one cache entry.
                # NOTE(review): with a truthiness test, term / 10 == 0 would
                # also yield 'None' — presumably harmless, but confirm.
                tid = isinstance(term, int) and term / 10 or 'None'
                # Separate cache keys for the direct and the inverse
                # (difference-based) computation below.
                if resultset is None:
                    cachekey = '_daterangeindex_%s_%s' % (iid, tid)
                else:
                    cachekey = '_daterangeindex_inverse_%s_%s' % (iid, tid)
                if cache is None:
                    cache = REQUEST[key] = RequestCache()
                else:
                    cached = cache.get(cachekey, None)
                    if cached is not None:
                        if resultset is None:
                            return (cached, (self._since_field,
                                             self._until_field))
                        else:
                            return (difference(resultset, cached),
                                    (self._since_field, self._until_field))

        if resultset is None:
            # Aggregate sets for each bucket separately, to avoid
            # large-small union penalties.
            until_only = multiunion(self._until_only.values(term))
            since_only = multiunion(self._since_only.values(None, term))
            until = multiunion(self._until.values(term))

            # Total result is bound by resultset
            # NOTE(review): resultset is None in this branch, so this
            # intersection is a no-op; the "REQUEST is None" guard looks
            # inverted — confirm against the upstream DateRangeIndex code.
            if REQUEST is None:
                until = intersection(resultset, until)

            since = multiunion(self._since.values(None, term))
            bounded = intersection(until, since)

            # Merge from smallest to largest.
            result = multiunion(
                [bounded, until_only, since_only, self._always])
            if REQUEST is not None and catalog is not None:
                cache[cachekey] = result

            return (result, (self._since_field, self._until_field))
        else:
            # Compute the inverse and subtract from res
            until_only = multiunion(self._until_only.values(None, term - 1))
            since_only = multiunion(self._since_only.values(term + 1))
            until = multiunion(self._until.values(None, term - 1))
            since = multiunion(self._since.values(term + 1))

            result = multiunion([until_only, since_only, until, since])
            if REQUEST is not None and catalog is not None:
                cache[cachekey] = result

            return (difference(resultset,
                               result), (self._since_field, self._until_field))
Пример #34
0
    def sortResults(self, rs, sort_index, reverse=False, limit=None,
            merge=True, actual_result_count=None, b_start=0, b_size=None):
        # Sort a result set using one or more sort indexes. Both sort_index
        # and reverse can be lists of indexes and reverse specifications.
        # Return a lazy result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples, where
        # sortkey can be a tuple on its own.
        second_indexes = None
        second_indexes_key_map = None
        sort_index_length = 1
        if isinstance(sort_index, list):
            sort_index_length = len(sort_index)
            if sort_index_length > 1:
                second_indexes = sort_index[1:]
                second_indexes_key_map = []
                for si in second_indexes:
                    second_indexes_key_map.append(si.documentToKeyMap())
            sort_index = sort_index[0]
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        result = []
        r_append = result.append
        r_insert = result.insert
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the result set, reverse sorting
        # order and limit it, then reverse the result set again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            if isinstance(reverse, list):
                reverse = [not r for r in reverse]
            else:
                reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        # determine sort_spec
        if isinstance(reverse, list):
            sort_spec = [r and -1 or 1 for r in reverse]
            # limit to current maximum of sort indexes
            sort_spec = sort_spec[:sort_index_length]
            # use first sort order for choosing the algorithm
            first_reverse = reverse[0]
        else:
            sort_spec = []
            for i in xrange(sort_index_length):
                sort_spec.append(reverse and -1 or 1)
            first_reverse = reverse

        if merge and limit is None and (
           rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # TODO: len(sort_index) isn't actually what we want for a keyword
            # index, as it's only the unique values, not the documents.
            # Don't use this case while using limit, as we return results of
            # non-flattened intsets, and would have to merge/unflattened those
            # before limiting.
            length = 0
            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            if sort_index_length == 1:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        r_append((k, intset, _self__getitem__))
                result.sort(reverse=reverse)
            else:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        # sort on secondary index
                        keysets = defaultdict(list)
                        for i in intset:
                            full_key = (k, )
                            for km in second_indexes_key_map:
                                try:
                                    full_key += (km[i], )
                                except KeyError:
                                    pass
                            keysets[full_key].append(i)
                        for k2, v2 in keysets.items():
                            r_append((k2, v2, _self__getitem__))
                result = multisort(result, sort_spec)
            sequence, slen = self._limit_sequence(result, length, b_start,
                b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index.
            # If we are interested in at least 25% or more of the result set,
            # the N-Best algorithm is slower, so we iterate over all.
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        # The reference back to __getitem__ is used in case
                        # we do not merge now and need to intermingle the
                        # results with those of other catalogs while avoiding
                        # the cost of instantiating a LazyMap per result
                        r_append((key, did, _self__getitem__))
                if merge:
                    result.sort(reverse=reverse)
            else:
                for did in rs:
                    try:
                        full_key = (index_key_map[did], )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        r_append((full_key, did, _self__getitem__))
                if merge:
                    result = multisort(result, sort_spec)
            if merge:
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                return sequence
        elif first_reverse:
            # Limit / sort results using N-Best algorithm
            # This is faster for large sets then a full sort
            # And uses far less memory
            keys = []
            k_insert = keys.insert
            n = 0
            worst = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result.reverse()
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence
        elif not first_reverse:
            # Limit / sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            k_insert = keys.insert
            n = 0
            best = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence

        return LazyMap(self.__getitem__, result, len(result),
            actual_result_count=actual_result_count)
Пример #35
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the request arg.

        The request argument should be a mapping object.

        If the request does not have a key which matches the "id" of
        the index instance, then None is returned.

        If the request *does* have a key which matches the "id" of
        the index instance, one of a few things can happen:

          - if the value is a blank string, None is returned (in
            order to support requests from web forms where
            you can't tell a blank string from empty).

          - if the value is a nonblank string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

          - If the value is a dict and contains a key of the form
            '<index>_operator' this overrides the default method
            ('or') to combine search results. Valid values are "or"
            and "and".

        If None is not returned as a result of the abovementioned
        constraints, two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.

        FAQ answer:  to search a Field Index for documents that
        have a blank string as their value, wrap the request value
        up in a tuple ala: request = {'id':('',)}
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % escape(operator))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if "min" in range_parm:
                opr_args.append("min")
            if "max" in range_parm:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args: lo = min(record.keys)
            else: lo = None
            if 'max' in opr_args: hi = max(record.keys)
            else: hi = None
            # BUGFIX: compare against None instead of truthiness; a falsy
            # upper bound such as 0 or '' used to be silently ignored,
            # turning a bounded range into an open-ended one.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # Single document ids are stored unwrapped as bare ints;
            # normalize every entry to an IISet so the set operations
            # below (and the single-key early return) are uniform.
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            setlist = tmp

            # If we only use one key, return immediately
            if len(setlist) == 1:
                return setlist[0], (self.id, )

            if operator == 'or':
                r = multiunion(setlist)
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)
                r = resultset
                for s in setlist:
                    # the result is bound by the resultset
                    r = intersection(r, s)

        else:  # not a range search
            # Collect the per-key document sets, bailing out early
            # whenever the final result is already determined.
            setlist = []
            for k in record.keys:
                s = index.get(k, None)
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # If union, we can't possibly get a bigger result
                        continue
                    # If intersection, we can't possibly get a smaller result
                    return IISet(), (self.id, )
                elif isinstance(s, int):
                    s = IISet((s, ))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                return setlist[0], (self.id, )

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.
                if resultset is not None and len(resultset) < 200:
                    smalllist = []
                    for s in setlist:
                        smalllist.append(intersection(resultset, s))
                    r = multiunion(smalllist)
                else:
                    r = multiunion(setlist)
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)
                r = resultset
                for s in setlist:
                    r = intersection(r, s)

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet(), (self.id, )
        return r, (self.id, )
Пример #36
0
    def _select_rids(self, query):
        """Searches the table for matches, returning record ids.

        Returns a sequence of record ids, or None for all records.
        """
        primary_key = []
        params = 0  # The number of parameters specified
        primary_params = 0  # The number of primary params specified
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                params += 1
                if column.primary:
                    primary_params += 1
                    if primary_key is not None:
                        primary_key.append(value)
            elif column.primary:
                # Didn't fully specify the primary key.
                # Can't search by primary key.
                primary_key = None

        if not params:
            # No query.  Select all.
            return None

        # First strategy: try to satisfy the request by consulting
        # the primary key index.
        if primary_key:
            # The primary key is complete.  The result set will have
            # either zero rows or one row.
            primary_key = tuple(primary_key)
            rid = self.primary_index.get(primary_key)
            if rid is None:
                return ()
            # Possibly filter out the single item.
            if params > primary_params:
                cand = self.data[rid]
                for position, column in self.col_info:
                    if query[position] is not None:
                        if cand[position] != query[position]:
                            # Not a match.
                            return ()
            return (rid, )

        # Second strategy: try to satisfy the request by intersecting
        # indexes.
        rids = None
        iteration_filters = []
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                index = self.indexes.get(column.name)
                if index is None:
                    iteration_filters.append((position, value))
                else:
                    set = index.get(value)
                    if set is None:
                        # No rows satisfy this criterion.
                        return ()
                    if rids is None:
                        rids = set
                    else:
                        rids = intersection(rids, set)
                    if not rids:
                        # No rows satisfy all criteria.
                        return ()
        if rids is not None:
            rids = rids.keys()

        if not iteration_filters:
            # Indexes did all the work.  No need to search each record.
            return rids

        # Fallback strategy: Eliminate items one by one.
        if rids is None:
            # Use the whole data set.
            candidates = self.data.values()
        else:
            # Use the specified records.
            candidates = [self.data[rid] for rid in rids]

        rids = []
        append = rids.append
        for cand in candidates:
            for position, value in iteration_filters:
                if cand[position] != value:
                    # Not a match.
                    break
            else:
                # A match.
                append(cand[0])
        return rids
Пример #37
0
    def sortResults(self,
                    rs,
                    sort_index,
                    reverse=False,
                    limit=None,
                    merge=True,
                    actual_result_count=None,
                    b_start=0,
                    b_size=None):
        """Sort result set ``rs`` by one or more sort indexes.

        Picks one of three strategies depending on ``merge``, ``limit``
        and relative sizes: iterate the sorted index itself, fully sort
        the result set, or keep only the N best via bisect insertion.
        See the comment block below for the return-value contract.
        """
        # Sort a result set using one or more sort indexes. Both sort_index
        # and reverse can be lists of indexes and reverse specifications.
        # Return a lazy result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples, where
        # sortkey can be a tuple on its own.
        second_indexes = None
        second_indexes_key_map = None
        sort_index_length = 1
        if isinstance(sort_index, list):
            # Multi-level sort: the first index drives the algorithm; the
            # remaining ones only contribute to the composite sort key.
            sort_index_length = len(sort_index)
            if sort_index_length > 1:
                second_indexes = sort_index[1:]
                second_indexes_key_map = []
                for si in second_indexes:
                    second_indexes_key_map.append(si.documentToKeyMap())
            sort_index = sort_index[0]
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        result = []
        r_append = result.append
        r_insert = result.insert
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the result set, reverse sorting
        # order and limit it, then reverse the result set again
        # NOTE: ``rlen / 2`` is integer division under Python 2 (this module
        # uses ``xrange``), which appears to be the intended semantics.
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            if isinstance(reverse, list):
                reverse = [not r for r in reverse]
            else:
                reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        # determine sort_spec (+1/-1 per sort level, consumed by multisort)
        if isinstance(reverse, list):
            sort_spec = [r and -1 or 1 for r in reverse]
            # limit to current maximum of sort indexes
            sort_spec = sort_spec[:sort_index_length]
            # use first sort order for choosing the algorithm
            first_reverse = reverse[0]
        else:
            sort_spec = []
            for i in xrange(sort_index_length):
                sort_spec.append(reverse and -1 or 1)
            first_reverse = reverse

        if merge and (rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # TODO: len(sort_index) isn't actually what we want for a keyword
            # index, as it's only the unique values, not the documents.
            length = 0
            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            if sort_index_length == 1:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        r_append((k, intset, _self__getitem__))
                result.sort(reverse=reverse)
            else:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        # sort on secondary index
                        keysets = defaultdict(list)
                        for i in intset:
                            full_key = (k, )
                            for km in second_indexes_key_map:
                                try:
                                    full_key += (km[i], )
                                except KeyError:
                                    pass
                            keysets[full_key].append(i)
                        for k2, v2 in keysets.items():
                            r_append((k2, v2, _self__getitem__))
                result = multisort(result, sort_spec)
            sequence, slen = self._limit_sequence(result, length, b_start,
                                                  b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index.
            # If we are interested in at least 25% or more of the result set,
            # the N-Best algorithm is slower, so we iterate over all.
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        # NOTE(review): unlike a sibling variant of this
                        # method, actual_result_count is NOT decremented
                        # here, so the reported count can exceed the number
                        # of sorted results -- confirm this is intended.
                        pass
                    else:
                        # The reference back to __getitem__ is used in case
                        # we do not merge now and need to intermingle the
                        # results with those of other catalogs while avoiding
                        # the cost of instantiating a LazyMap per result
                        r_append((key, did, _self__getitem__))
                if merge:
                    result.sort(reverse=reverse)
            else:
                for did in rs:
                    try:
                        full_key = (index_key_map[did], )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        r_append((full_key, did, _self__getitem__))
                if merge:
                    result = multisort(result, sort_spec)
            if merge:
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif first_reverse:
            # Limit / sort results using N-Best algorithm
            # This is faster for large sets then a full sort
            # And uses far less memory
            # ``keys`` is kept in ascending order; keys[0] is always the
            # worst key currently retained among the best ``limit`` found.
            keys = []
            k_insert = keys.insert
            n = 0
            worst = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            # drop the current worst to keep only N entries
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result.reverse()
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                               switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence
        elif not first_reverse:
            # Limit / sort results using N-Best algorithm in reverse (N-Worst?)
            # Mirror image of the branch above: keys[-1] is the current
            # best key retained; anything >= it is discarded once full.
            keys = []
            k_insert = keys.insert
            n = 0
            best = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                               switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence

        return LazyMap(self.__getitem__,
                       result,
                       len(result),
                       actual_result_count=actual_result_count)
Пример #38
0
    def search(self, path, default_level=0, depth=-1, navtree=0,
                                                             navtree_start=0):
        """Search the path index for documents matching *path*.

        path is either a string representing a
        relative URL or a part of a relative URL or
        a tuple (path,level).

        level >= 0  starts searching at the given level
        level <  0  not implemented yet

        Returns an IISet of document ids.
        """

        # A tuple argument carries an explicit start level; a plain string
        # uses default_level.
        if isinstance(path, basestring):
            startlevel = default_level
        else:
            startlevel = int(path[1])
            path = path[0]

        absolute_path = isinstance(path, basestring) and path.startswith('/')
        comps = filter(None, path.split('/'))

        # Keep the original components (with a leading '' so '/'.join()
        # produces absolute paths) for the navtree/breadcrumb branches below.
        orig_comps = [''] + comps[:]

        if depth > 0:
            raise ValueError, "Can't do depth searches anymore"
        if not comps:
            # Empty path: search from the 'dmd' root.
            comps = ['dmd']
            startlevel = 1
        else:
            # Strip the configured virtual root and the 'zport' application
            # prefix so every query is rooted at 'dmd'.
            if comps[0] == getCSEConf().get('virtualroot', '').replace('/', ''):
                comps = comps[1:]
            # NOTE(review): if the virtualroot was the only component, comps
            # is now empty and the next line raises IndexError -- confirm
            # callers never pass such a path.
            if comps[0] == 'zport':
                comps = comps[1:]

        if comps[0] != 'dmd':
            raise ValueError, "Depth searches must start with 'dmd'"
        # Match at the depth of the full requested path, overriding any
        # caller-supplied start level.
        startlevel = len(comps)

        # NOTE(review): comps is always non-empty here (it was defaulted to
        # ['dmd'] above), so this branch appears to be dead code.
        if len(comps) == 0:
            if depth == -1 and not navtree:
                return IISet(self._unindex.keys())

        # Make sure that we get depth = 1 if in navtree mode
        # unless specified otherwise

        orig_depth = depth
        if depth == -1:
            depth = 0 or navtree

        # Optimized navtree starting with absolute path
        if absolute_path and navtree and depth == 1 and default_level==0:
            set_list = []
            # Insert root element
            if navtree_start >= len(orig_comps):
                navtree_start = 0
            # create a set of parent paths to search
            for i in range(len(orig_comps), navtree_start, -1):
                parent_path = '/'.join(orig_comps[:i])
                parent_path = parent_path and parent_path or '/'
                try:
                    set_list.append(self._index_parents[parent_path])
                except KeyError:
                    pass
            return multiunion(set_list)
        # Optimized breadcrumbs
        elif absolute_path and navtree and depth == 0 and default_level==0:
            item_list = IISet()
            # Insert root element
            if navtree_start >= len(orig_comps):
                navtree_start = 0
            # create a set of parent paths to search
            for i in range(len(orig_comps), navtree_start, -1):
                parent_path = '/'.join(orig_comps[:i])
                parent_path = parent_path and parent_path or '/'
                try:
                    item_list.insert(self._index_items[parent_path])
                except KeyError:
                    pass
            return item_list
        # Specific object search
        elif absolute_path and orig_depth == 0 and default_level == 0:
            try:
                return IISet([self._index_items[path]])
            except KeyError:
                return IISet()
        # Single depth search
        elif absolute_path and orig_depth == 1 and default_level == 0:
            # only get objects contained in requested folder
            try:
                return self._index_parents[path]
            except KeyError:
                return IISet()
        # Sitemaps, relative paths, and depth queries
        elif startlevel >= 0:

            pathset = None # Same as pathindex
            navset  = None # For collecting siblings along the way
            depthset = None # For limiting depth

            # Seed the sibling set with everything indexed at the start
            # level (the None key holds per-level document sets).
            if navtree and depth and \
                   self._index.has_key(None) and \
                   self._index[None].has_key(startlevel):
                navset = self._index[None][startlevel]
            for level in range(startlevel, startlevel+len(comps)):
                if level <= len(comps):
                    # Path prefixes are indexed as joined component strings.
                    comp = "/".join(comps[:level])
                    if (not self._index.has_key(comp)
                        or not self._index[comp].has_key(level)):
                        # Navtree is inverse, keep going even for
                        # nonexisting paths
                        if navtree:
                            pathset = IISet()
                        else:
                            return IISet()
                    else:
                        # NOTE(review): returning here short-circuits the
                        # navtree/depthset accumulation below, and pathset is
                        # never assigned on this success path (so the later
                        # intersections would see None) -- confirm this early
                        # return is intentional.
                        return self._index[comp][level]
                    if navtree and depth and \
                           self._index.has_key(None) and \
                           self._index[None].has_key(level+depth):
                        navset  = union(navset, intersection(pathset,
                                              self._index[None][level+depth]))
                if level-startlevel >= len(comps) or navtree:
                    if (self._index.has_key(None)
                        and self._index[None].has_key(level)):
                        depthset = union(depthset, intersection(pathset,
                                                    self._index[None][level]))

            if navtree:
                return union(depthset, navset) or IISet()
            elif depth:
                return depthset or IISet()
            else:
                return pathset or IISet()

        else:
            # Negative start level: brute-force -- try to match the
            # components at every depth recorded for this index.
            results = IISet()
            for level in range(0,self._depth + 1):
                ids = None
                error = 0
                for cn in range(0,len(comps)):
                    comp = comps[cn]
                    try:
                        ids = intersection(ids,self._index[comp][level+cn])
                    except KeyError:
                        error = 1
                if error==0:
                    results = union(results,ids)
            return results
Пример #39
0
import time
from random import choice

from BTrees.IIBTree import IISet, union, intersection, difference


def make_choice(data, per):
    """Return a random sample, with replacement, from *data*.

    data -- a non-empty sequence to sample from
    per  -- percentage (0-100); the sample holds len(data) * per / 100
            elements (truncated to an integer)
    """
    # range() needs an integer bound; the original passed the float
    # data_len * float(per) / 100.0, which only worked (deprecated) on
    # old Python 2 interpreters.  int() truncates exactly as float-range
    # did, so the sample size is unchanged.
    count = int(len(data) * per / 100.0)
    return [choice(data) for _ in range(count)]


for max in (500, 2500, 5000, 10000, 25000, 50000, 100000):
    data = range(max)

    for p1, p2 in ((25, 25), (25, 50), (25, 75), (25, 100), (50, 50), (50, 75),
                   (50, 100), (75, 75), (75, 100), (100, 100)):

        d1 = IISet(make_choice(data, p1))
        d2 = IISet(make_choice(data, p2))

        ts = time.time()
        union(d1, d2)
        tu = time.time() - ts

        ts = time.time()
        intersection(d1, d2)
        ti = time.time() - ts

        ts = time.time()
        difference(d1, d2)
        td = time.time() - ts

        print '%6d %3d:%3d  %6.6f  %6.6f %6.6f' % (max, p1, p2, tu, ti, td)