def _apply_index(self, request):
    record = parseIndexRequest(request, self.id)
    try:
        qstart, qend = record.keys
    except TypeError:
        return None

    minint = BTrees.family64.minint
    maxint = BTrees.family64.maxint

    qstart = min(maxint, max(minint, qstart))
    qend = max(minint, min(maxint, qend))

    # start is inside the queried range
    start = multiunion(self._since_index.values(max=qstart))
    end = multiunion(self._until_index.values(min=qstart))
    start_into = intersection(start, end)

    # end is inside the queried range
    start = multiunion(self._since_index.values(max=qend))
    end = multiunion(self._until_index.values(min=qend))
    end_into = intersection(start, end)

    # start before range and end after range
    start = multiunion(self._since_index.values(min=qstart))
    end = multiunion(self._until_index.values(max=qend))
    start_before_end_after = intersection(start, end)

    result = union(start_into, end_into)
    result = union(result, start_before_end_after)

    return multiunion(map(self._index.__getitem__, result)), (self.id,)
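# A minimal, self-contained sketch (not from the original source) of the
# BTrees.IIBTree set semantics the range logic above relies on:
# multiunion() flattens a sequence of sets, and intersection()/union()
# treat None as "no restriction".  The docids below are made up.
from BTrees.IIBTree import IISet, intersection, union, multiunion

since_buckets = [IISet([1, 2]), IISet([2, 3])]   # hypothetical buckets
until_buckets = [IISet([2, 3, 4])]

start = multiunion(since_buckets)                # IISet([1, 2, 3])
end = multiunion(until_buckets)                  # IISet([2, 3, 4])
overlap = intersection(start, end)               # IISet([2, 3])
assert list(union(overlap, IISet([9]))) == [2, 3, 9]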
def search(self, path, default_level=0):
    """
    path is either a string representing a relative URL (or a part of
    a relative URL), or a tuple (path, level).

    level >= 0  starts searching at the given level
    level < 0   searches the path at every level (0 .. _depth)
    """
    if isinstance(path, StringType):
        level = default_level
    else:
        level = int(path[1])
        path = path[0]

    comps = self.splitPath(path)

    if len(comps) == 0:
        return IISet(self._unindex.keys())

    if level >= 0:
        results = []
        for i in range(len(comps)):
            comp = comps[i]
            if not self._index.has_key(comp):
                return IISet()
            if not self._index[comp].has_key(level + i):
                return IISet()
            results.append(self._index[comp][level + i])

        res = results[0]
        for i in range(1, len(results)):
            res = intersection(res, results[i])
        return res
    else:
        results = IISet()
        for level in range(0, self._depth + 1):
            ids = None
            error = 0
            for cn in range(0, len(comps)):
                comp = comps[cn]
                try:
                    ids = intersection(ids, self._index[comp][level + cn])
                except KeyError:
                    error = 1
            if error == 0:
                results = union(results, ids)
        return results
def timing(self, small, large, text=''):
    new = 0.0
    old = 0.0
    c = 0.0
    loop = LOOP
    for i in xrange(loop):
        start = time()
        intersection2(small, large)
        new += (time() - start)
        start = time()
        intersection(small, large)
        old += (time() - start)
        start = time()
        ciiintersection(small, large)
        c += (time() - start)
    new_ratio = old / new
    c_ratio = old / c
    new_report = False
    if new_ratio <= 0.4 or new_ratio > 2:
        new_report = True
    c_report = False
    if c_ratio <= 0.8 or c_ratio > 1.2:
        c_report = True
    if c_report or new_report:
        print
        print text
        print 'Old x%s: %.6f' % (loop, old)
        print 'New x%s: %.6f - factor: %.2f' % (loop, new, new_ratio)
        print 'Cyt x%s: %.6f - factor: %.2f' % (loop, c, c_ratio)
def _sort_iterate_index(self, actual_result_count, result, rs,
                        limit, merge, reverse,
                        sort_index, sort_index_length, sort_spec,
                        second_indexes_key_map):
    # The result set is much larger than the sorted index,
    # so iterate over the sorted index for speed.
    # TODO: len(sort_index) isn't actually what we want for a keyword
    # index, as it's only the unique values, not the documents.
    # Don't use this case while using limit, as we return results of
    # non-flattened intsets, and would have to merge/unflatten those
    # before limiting.
    length = 0
    try:
        intersection(rs, IISet(()))
    except TypeError:
        # rs is not an object in the IIBTree family.
        # Try to turn rs into an IISet.
        rs = IISet(rs)

    if sort_index_length == 1:
        for k, intset in sort_index.items():
            # We have an index that has a set of values for
            # each sort key, so we intersect with each set and
            # get a sorted sequence of the intersections.
            intset = intersection(rs, intset)
            if intset:
                keys = getattr(intset, 'keys', None)
                if keys is not None:
                    # Is this ever true?
                    intset = keys()
                length += len(intset)
                result.append((k, intset, self.__getitem__))
        result.sort(reverse=reverse)
    else:
        for k, intset in sort_index.items():
            # We have an index that has a set of values for
            # each sort key, so we intersect with each set and
            # get a sorted sequence of the intersections.
            intset = intersection(rs, intset)
            if intset:
                keys = getattr(intset, 'keys', None)
                if keys is not None:
                    # Is this ever true?
                    intset = keys()
                length += len(intset)
                # sort on secondary index
                keysets = defaultdict(list)
                for i in intset:
                    full_key = (k, )
                    for km in second_indexes_key_map:
                        try:
                            full_key += (km[i], )
                        except KeyError:
                            pass
                    keysets[full_key].append(i)
                for k2, v2 in keysets.items():
                    result.append((k2, v2, self.__getitem__))
        result = multisort(result, sort_spec)
    return (actual_result_count, length, result)
def train(self):
    """Train the classifier from the noun terms and subjects (tags)
    of all objects that are indexed under both catalog indexes.
    """
    catalog = getToolByName(self, "portal_catalog")
    presentNouns = dict()
    trainingData = []
    allNouns = catalog.uniqueValuesFor("noun_terms")
    for item in allNouns:
        presentNouns.setdefault(item, 0)

    subjectIndex = catalog._catalog.getIndex("Subject")
    nounTermsIndex = catalog._catalog.getIndex("noun_terms")

    # The internal catalog ids of the objects
    # that have noun terms in the catalog
    nounTermIndexIds = IISet(nounTermsIndex._unindex.keys())
    # The internal catalog ids of the objects
    # that have subjects in the catalog
    subjectIndexIds = IISet(subjectIndex._unindex.keys())
    commonIds = intersection(subjectIndexIds, nounTermIndexIds)

    for cid in commonIds:
        nounPresence = presentNouns.copy()
        nouns = nounTermsIndex._unindex[cid]
        tags = subjectIndex._unindex[cid]
        for noun in nouns:
            nounPresence[noun] = 1
        for tag in tags:
            trainingData.append((nounPresence, tag))
    if trainingData:
        self.classifier = NaiveBayesClassifier.train(trainingData)
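# The train() call above matches NLTK's NaiveBayesClassifier API; the
# source does not name the library, so treat this as an assumption.  A
# tiny sketch of the expected (feature-dict, label) training shape:
from nltk import NaiveBayesClassifier  # assumed library

training_data = [
    ({'dog': 1, 'cat': 0}, 'pets'),
    ({'dog': 0, 'cat': 1}, 'pets'),
    ({'dog': 0, 'cat': 0}, 'other'),
]
classifier = NaiveBayesClassifier.train(training_data)
print(classifier.classify({'dog': 1, 'cat': 0}))  # expected: 'pets'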
def keywords_of_section(self, section, kwfilter):
    """Valid keywords under the given section.
    """
    pcat = getToolByName(section, 'portal_catalog')
    cat = pcat._catalog
    path_idx = cat.indexes[self.path_index]
    tags_idx = cat.indexes[self.keyword_index]
    result = []
    # query all oids of path - low level
    pquery = {
        self.path_index: {
            'query': '/'.join(section.getPhysicalPath()),
            'depth': -1,
        }
    }
    kwfilter = safe_encode(kwfilter)
    # uses internal zcatalog specific details to quickly get the values
    path_result, info = path_idx._apply_index(pquery)
    for tag in tags_idx.uniqueValues():
        if kwfilter and kwfilter not in safe_encode(tag):
            continue
        tquery = {self.keyword_index: tag}
        tags_result, info = tags_idx._apply_index(tquery)
        if intersection(path_result, tags_result):
            result.append(tag)
    # result should be sorted, because uniqueValues are
    return safe_simplevocabulary_from_values(result)
def _reindex_doc(self, docid, text):
    # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
    old_wids = self.get_words(docid)
    old_wid2w, old_docw = self._get_frequencies(old_wids)

    new_wids = self._lexicon.sourceToWordIds(text)
    new_wid2w, new_docw = self._get_frequencies(new_wids)

    old_widset = IITreeSet(old_wid2w.keys())
    new_widset = IITreeSet(new_wid2w.keys())

    in_both_widset = intersection(old_widset, new_widset)
    only_old_widset = difference(old_widset, in_both_widset)
    only_new_widset = difference(new_widset, in_both_widset)
    del old_widset, new_widset

    for wid in only_old_widset.keys():
        self._del_wordinfo(wid, docid)
    for wid in only_new_widset.keys():
        self._add_wordinfo(wid, new_wid2w[wid], docid)
    for wid in in_both_widset.keys():
        # For the Okapi indexer, the "if" will trigger only for words
        # whose counts have changed.  For the cosine indexer, the "if"
        # may trigger for every wid, since W(d) probably changed and
        # W(d) is divided into every score.
        newscore = new_wid2w[wid]
        if old_wid2w[wid] != newscore:
            self._add_wordinfo(wid, newscore, docid)

    self._docweight[docid] = new_docw
    self._docwords[docid] = WidCode.encode(new_wids)
    return len(new_wids)
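# Sketch (illustrative word ids, standard BTrees.IIBTree API): the
# three-way partition above splits word ids into unchanged, removed and
# added groups with one intersection and two differences.
from BTrees.IIBTree import IITreeSet, intersection, difference

old = IITreeSet([10, 11, 12])
new = IITreeSet([11, 12, 13])
both = intersection(old, new)        # ids present before and after
removed = difference(old, both)      # only in the old text
added = difference(new, both)        # only in the new text
assert (list(both), list(removed), list(added)) == ([11, 12], [10], [13])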
def timing(self, small, large):
    new = 0.0
    old = 0.0
    new2 = 0.0
    c = 0.0
    loop = LOOP
    for i in xrange(loop):
        start = time()
        res = intersection2(small, large)
        new += (time() - start)
        start = time()
        res = intersection(small, large)
        old += (time() - start)
        if ciiintersection is not None:
            start = time()
            res = ciiintersection(small, large)
            c += (time() - start)
    print 'Old x%s: %.6f' % (loop, old)
    print 'New x%s: %.6f' % (loop, new)
    if ciiintersection is not None:
        print 'Cyt x%s: %.6f' % (loop, c)
def below(self, arg):
    """Find all resources at or below path, within the limits given.
    """
    # Parse and validate.
    # ===================
    path, upper, lower = self._path_and_limits(arg)
    rid = self.path2rid.get(path, None)
    if rid is None:
        return

    # Build
    # =====
    parts = path.split(os.sep)
    rids = None
    for level in range(len(parts)):
        rids = intersection(rids, self.parts[(level, parts[level])])
    if rids is None:
        return IISet()  # short-cut

    # Limits
    # ======
    # Remove rids that are above any upper limit, and then only include
    # rids that are above any lower limit.  Limits are relative to the
    # level of the requested path.
    if upper is not None:
        upper += level
        for i in range(level, upper):
            if i not in self.levels:
                break
            rids = difference(rids, self.levels[i])
    if lower is not None:
        lower += level
        _rids = []
        for i in range(level, lower):
            if i not in self.levels:
                break
            _rids.append(self.levels[i])
        rids = intersection(rids, multiunion(_rids))

    return rids
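# Sketch of the `rids = None` accumulator idiom above: the IIBTree set
# operations are None-tolerant, so intersection(None, x) returns x.
# The first loop pass therefore seeds the accumulator and later passes
# narrow it (values are illustrative).
from BTrees.IIBTree import IISet, intersection

candidate_sets = [IISet([1, 2, 3]), IISet([2, 3]), IISet([3, 4])]
rids = None
for s in candidate_sets:
    rids = intersection(rids, s)
assert list(rids) == [3]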
def _apply_index(self, request, cid=''):
    """Apply the index to query parameters given in 'request', which
    should be a mapping object.

    If the request does not contain the needed parameters, then
    return None.

    If the request contains a parameter with the name of the column
    + "_usage", sniff for information on how to handle applying the
    index.

    Otherwise return two objects.  The first object is a ResultSet
    containing the record numbers of the matching records.  The
    second object is a tuple containing the names of all data fields
    used.
    """
    record = parseIndexRequest(request, self.getId())
    if record.keys is None:
        return None

    term = self._convertDateTime(record.keys[0])

    # Aggregate sets for each bucket separately (via multiunion
    # rather than repeated updates), to avoid large-small union
    # penalties.
    until_only = multiunion(self._until_only.values(term))
    since_only = multiunion(self._since_only.values(None, term))
    until = multiunion(self._until.values(term))
    since = multiunion(self._since.values(None, term))

    bounded = intersection(until, since)

    # Merge from smallest to largest.
    result = union(bounded, until_only)
    result = union(result, since_only)
    result = union(result, self._always)

    return result, (self._since_field, self._until_field)
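# Sketch of why the loop-based unions were replaced here: multiunion()
# takes a whole sequence of sets at once and avoids the repeated
# large-small union penalty mentioned above (sets are made up).
from BTrees.IIBTree import IISet, multiunion

buckets = [IISet([1, 2]), IISet([2, 3]), IISet([9])]
assert list(multiunion(buckets)) == [1, 2, 3, 9]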
def intersectionResultSets(sets):
    """Perform an intersection of ResultSets.
    """
    if not sets:
        return ResultSet(DocidList(), WordList())

    docids = sets[0].getDocids()
    words = WordList(sets[0].getWords())
    for set in sets[1:]:
        docids = intersection(docids, set.docids)
        words.extend(set.words)
    return ResultSet(docids, words)
def _eval(self, context):
    csq = self._classifySubqueries()
    if csq['empty']:
        return IISet()  # empty result
    nsq = csq['lookup'] + csq['complex'] + csq['indexed']
    notsq = csq['notQ']
    if not nsq and not notsq:
        # an empty 'And' query
        return context._getObjectIds()
    if not nsq:
        nsq.append(notsq.pop())
    r = None
    for q in nsq:
        r = intersection(r, q._eval(context))
    for q in notsq:
        r = difference(r, q._query._eval(context))
    return r
def count(self, context, facet, intersect=None):
    if IQueryResults.providedBy(intersect):
        intersect = IISet(intersect.keys())
    sm = sitemanager_for(context)
    unique_name = '%s.%s' % (facet.name, self.name)
    cache_tools = queryUtility(ISetCacheTools, context=sm)
    invalidated = cache_tools.invalidated_records
    if not isinstance(invalidated, IISet):
        invalidated = IISet(invalidated)
    # default when no intersect set is given (avoids a NameError below)
    invalid = False
    if isinstance(intersect, IISet):
        invalid = len(intersection(intersect, invalidated)) > 0
    if unique_name in cache_tools.filter_setid_cache:
        setid = cache_tools.filter_setid_cache[unique_name]
        if setid in cache_tools.set_cache:
            if invalid:
                del cache_tools.set_cache[setid]
                del cache_tools.filter_setid_cache[unique_name]
            else:
                records = cache_tools.set_cache[setid]
                if intersect is None:
                    return len(records)
                if isinstance(intersect, IISet):
                    # optimal to cast smaller set to match IISet
                    return len(intersection(intersect, IISet(records)))
                return len(set(intersect) & records)
    # otherwise, at this point, no cached value, so query catalog...
    qf = self(unique_name)
    runner = AdvancedQueryRunner(context)
    result = runner(qf)
    setid = result.setid
    cache_tools.set_cache[setid] = result.frozen
    cache_tools.filter_setid_cache[unique_name] = setid
    if intersect is None:
        return len(result)
    if isinstance(intersect, IISet):
        return len(intersection(intersect, IISet(result.frozen)))
    return len(set(intersect) & result.frozen)
def query_index(self, record, resultset=None):
    index = self._index
    indexed = self._index_value

    for key in record.keys:
        if bool(key) is bool(indexed):
            # If we match the indexed value, check index
            return intersection(index, resultset)
        else:
            # Otherwise, remove from resultset or _unindex
            if resultset is None:
                return union(difference(self._unindex, index), IISet([]))
            else:
                return difference(resultset, index)
    return IISet()
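# Sketch of the boolean-index trick above: only the docids carrying the
# indexed value are stored, so the opposite value is answered by set
# difference instead of a second lookup (docids are illustrative).
from BTrees.IIBTree import IISet, intersection, difference

index = IISet([2, 4])          # docids whose value equals the indexed one
resultset = IISet([1, 2, 3])

assert list(intersection(index, resultset)) == [2]    # query == indexed
assert list(difference(resultset, index)) == [1, 3]   # query != indexed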
def test_depth_limit_resultset(self):
    self._populateIndex()
    resultset = IISet([1, 2, 3, 4, 8, 16])
    tests = [
        # (depth, expected result)
        (1, [1, 8, 16]),
        (2, [1, 2, 8, 16]),
        (3, [1, 2, 3, 8, 16]),
    ]
    for depth, results in tests:
        res = self._index._apply_index(
            dict(path=dict(query='/', depth=depth)),
            resultset=resultset)
        combined = intersection(res[0], resultset)
        lst = list(combined)
        self.assertEqual(lst, results)
def _search(self, path, default_level=0):
    """Perform the actual search.

    ``path``
        a string representing a relative URL, or a part of a relative
        URL, or a tuple ``(path, level)``.  In the first two cases,
        use ``default_level`` as the level for the search.

    ``default_level``
        the level to use for non-tuple queries.

    ``level >= 0`` => match ``path`` only at the given level.

    ``level < 0``  => match ``path`` at *any* level.
    """
    if isinstance(path, str):
        level = default_level
    else:
        level = int(path[1])
        path = path[0]

    if level < 0:
        # Search at every level, return the union of all results
        return multiunion(
            [self._search(path, level)
             for level in range(self._depth + 1)])

    comps = filter(None, path.split('/'))

    if level + len(comps) - 1 > self._depth:
        # Our search is for a path longer than anything in the index
        return IISet()

    if len(comps) == 0:
        return IISet(self._unindex.keys())

    results = None
    for i, comp in reversed(list(enumerate(comps))):
        tree = self._index.get(comp, None)
        if tree is None:
            return IISet()
        tree2 = tree.get(level + i, None)
        if tree2 is None:
            return IISet()
        results = intersection(results, tree2)
    return results
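# Toy model (made-up structure mirroring the code above): _index maps
# component -> level -> IISet of docids, and a path matches only where
# every component matches at its level, hence the running intersection.
from BTrees.IIBTree import IISet, intersection

_index = {
    'plone': {0: IISet([1, 2, 3])},
    'news': {1: IISet([2, 3])},
}
results = None
for i, comp in enumerate(['plone', 'news']):
    results = intersection(results, _index[comp][i])
assert list(results) == [2, 3]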
def _apply_index(self, request, resultset=None):
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    indexed = self._index_value

    for key in record.keys:
        if bool(key) is bool(indexed):
            # If we match the indexed value, check index
            return (intersection(index, resultset), (self.id,))
        else:
            # Otherwise, remove from resultset or _unindex
            if resultset is None:
                return (union(difference(self._unindex, index),
                              IISet([])), (self.id,))
            else:
                return (difference(resultset, index), (self.id,))
    return (IISet(), (self.id,))
def _search_index(self, cr, index_id, query, rs):
    cr.start_split(index_id)

    index_rs = None
    index = self.getIndex(index_id)
    limit_result = ILimitedResultIndex.providedBy(index)

    if IQueryIndex.providedBy(index):
        index_query = IndexQuery(query, index.id, index.query_options,
                                 index.operators, index.useOperator)
        if index_query.keys is not None:
            index_rs = index.query_index(index_query, rs)
    else:
        if limit_result:
            index_result = index._apply_index(query, rs)
        else:
            index_result = index._apply_index(query)

        # Parse (resultset, used_attributes) index return value.
        if index_result:
            index_rs, _ = index_result

    if not index_rs:
        # Short circuit if empty index result.
        rs = None
    else:
        # Provide detailed info about the pure intersection time.
        intersect_id = index_id + '#intersection'
        cr.start_split(intersect_id)
        # weightedIntersection preserves the values from any mappings
        # we get, as some indexes don't return simple sets.
        if hasattr(rs, 'items') or hasattr(index_rs, 'items'):
            _, rs = weightedIntersection(rs, index_rs)
        else:
            rs = intersection(rs, index_rs)

        cr.stop_split(intersect_id)

    # Consider the time it takes to intersect the index result
    # with the total result set to be part of the index time.
    cr.stop_split(index_id, result=index_rs, limit=limit_result)
    return rs
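# Sketch of the weightedIntersection() branch above: it returns a
# (weight, result) pair -- hence the `_, rs = ...` unpacking -- and
# keeps a score-carrying IIBucket where plain intersection() would not
# (docids and scores below are illustrative).
from BTrees.IIBTree import IIBucket, IISet, weightedIntersection

scores = IIBucket({1: 10, 2: 20, 3: 30})   # docid -> score mapping
hits = IISet([2, 3, 4])                    # plain set from another index
weight, rs = weightedIntersection(scores, hits)
assert sorted(rs.keys()) == [2, 3]         # scored keys survive as a bucket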
def _apply_index(self, request, resultset=None):
    setlist = []
    indices_used = []
    for reltype in self.getIndexSourceNames():
        query = request.get(reltype)
        if query is None:
            continue
        if isinstance(query, str):
            target = query
        else:
            target = IUUID(query)
        indices_used.append(reltype)
        index = self._index[reltype]
        s = index.get(target)
        if s is None:
            continue
        else:
            setlist.append(s)
    if not indices_used:
        return
    if len(setlist) == 1:
        return setlist[0], tuple(indices_used)
    # If we already get a small result set passed in, intersecting
    # the various indexes with it and doing the union later is
    # faster than creating a multiunion first.
    if resultset is not None and len(resultset) < 200:
        smalllist = []
        for s in setlist:
            smalllist.append(intersection(resultset, s))
        r = multiunion(smalllist)
    else:
        r = multiunion(setlist)
    if r is None:
        r = IISet()
    return r, tuple(indices_used)
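# Sketch of the `len(resultset) < 200` optimization above, which uses
# the identity union_i(R & S_i) == R & union_i(S_i): intersecting each
# set with a small R first keeps every intermediate result tiny.
from BTrees.IIBTree import IISet, intersection, multiunion

R = IISet([1, 2, 3])
sets = [IISet([2, 100]), IISet([3, 200])]
a = multiunion([intersection(R, s) for s in sets])
b = intersection(R, multiunion(sets))
assert list(a) == list(b) == [2, 3]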
def query_index(self, record, resultset=None):
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record, resultset)
        cached = cache.get(cachekey, None)
        if cached is not None:
            if resultset is None:
                return cached
            else:
                return difference(resultset, cached)

    term = self._convertDateTime(record.keys[0])
    if resultset is None:
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))
        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        # Merge from smallest to largest.
        result = multiunion([bounded, until_only, since_only,
                             self._always])
        if cache is not None:
            cache[cachekey] = result
        return result
    else:
        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))
        result = multiunion([since, since_only, until_only, until])
        if cache is not None:
            cache[cachekey] = result
        return difference(resultset, result)
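# Sketch of the inverse-query trick in the resultset branch above:
# build the complement (documents known to be outside the range) and
# subtract it, relying on the identity R - (U \ M) == R & M.
from BTrees.IIBTree import IISet, intersection, difference

universe = IISet(range(10))
matches = IISet([2, 3, 4])
complement = difference(universe, matches)
R = IISet([1, 2, 3])
assert list(difference(R, complement)) == list(intersection(R, matches))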
def _apply_index(self, request, cid=''):
    record = parseIndexRequest(request, self.getId(), self.query_options)
    if record.keys is None:
        return None

    catalog = getToolByName(self, 'portal_catalog')
    geoIndex = catalog._catalog.getIndex(self.geoindex_id)
    geoRequest = {}
    geoRequest[self.geoindex_id] = {
        'query': record.keys, 'range': record.range}
    geo_response = geoIndex._apply_index(geoRequest, raw=True)
    paths = {}
    for item in geo_response:
        paths[int(item['id'])] = item['properties']['path']

    rolesIndex = catalog._catalog.getIndex('allowedRolesAndUsers')
    user = _getAuthenticatedUser(self)
    perms_set = rolesIndex._apply_index(
        {'allowedRolesAndUsers': catalog._listAllowedRolesAndUsers(user)}
    )[0]
    r = intersection(perms_set, IISet(paths.keys()))
    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.getId(),)
    else:
        url_tool = getToolByName(self, 'portal_url')
        portal_path = url_tool.getPortalObject().getPhysicalPath()
        root = list(portal_path)

        def up(path):
            return '/'.join(root + path.strip('/').split('/')[:-1])

        return union(
            r,
            IISet([catalog.getrid(up(paths[lid])) for lid in r])
        ), (self.getId(),)
def keywords_filtered_by_context(context, index_name='Subject'):
    """Valid subjects under the given context.
    """
    catalog_tool = api.portal.get_tool('portal_catalog')
    catalog = catalog_tool._catalog
    path_idx = catalog.indexes['path']
    tags_idx = catalog.indexes[index_name]
    result = []
    # query all oids of path - low level
    path_query = {
        'path': {
            'query': '/'.join(context.getPhysicalPath()),
            'depth': -1,
        }
    }
    path_result, info = path_idx._apply_index(path_query)
    for tag in tags_idx.uniqueValues():
        keyword_query = {index_name: tag}
        tags_result, info = tags_idx._apply_index(keyword_query)
        if intersection(path_result, tags_result):
            result.append(tag)
    return result
def group(self, seq):
    sortIndex = self._sortIndex
    sortReverse = self._sortReverse
    ns = len(seq)
    ni = len(sortIndex)
    if ns >= 0.1 * ni:
        # result large compared to index -- sort via index
        handled = IISet()
        hn = 0
        _load = getattr(sortIndex, '_load', None)
        if _load is None:
            # not an optimized index
            items = sortIndex.items()
            _load = lambda (x1, x2): x2
            if sortReverse:
                items.reverse()
        elif sortReverse:
            gRO = getattr(sortIndex, 'getReverseOrder', None)
            items = gRO and gRO()
            if items is None:
                items = list(sortIndex._index.keys())
                items.reverse()
        else:
            items = sortIndex._index.keys()
        for i in items:
            ids = intersection(seq, _load(i))
            if ids:
                handled.update(ids)
                hn += len(ids)
                yield i, ids
        if hn != len(seq):
            yield None, difference(seq, handled)
    else:
        # result relatively small -- sort via result
        m = OOBTree()
        keyFor = getattr(sortIndex, 'keyForDocument', None)
        # work around "nogopip" bug: it defines "keyForDocument"
        # as an integer
        if not callable(keyFor):
            # This will fail when the index defines neither a reasonable
            # "keyForDocument" nor "documentToKeyMap".  In this case,
            # the index cannot be used for sorting.
            keyFor = lambda doc, map=sortIndex.documentToKeyMap(): map[doc]
        noValue = IITreeSet()
        for doc in seq.keys():
            try:
                k = keyFor(doc)
            except KeyError:
                noValue.insert(doc)
                continue
            k = NaturalObjectCompare(k)
            l = m.get(k)
            if l is None:
                l = m[k] = IITreeSet()
            l.insert(doc)
        items = m.items()
        if sortReverse:
            items = list(items)
            items.reverse()
        for i in items:
            yield i
        if noValue:
            yield None, noValue
def search(self, query, sort_index=None, reverse=False, limit=None,
           merge=True):
    """Iterate through the indexes, applying the query to each one.

    If merge is true, return a lazy result set (sorted if appropriate);
    otherwise return the raw (possibly scored) results for later
    merging.  Limit is used in conjunction with sorting or scored
    results to inform the catalog how many results you are really
    interested in.  The catalog can then use optimizations to save time
    and memory.  The number of results is not guaranteed to fall within
    the limit, however; you should still slice or batch the results as
    usual."""
    # Indexes fulfill a fairly large contract here.  We hand each
    # index the query mapping we are given (which may be composed
    # of some combination of web request, kw mappings or plain old
    # dicts) and the index decides what to do with it.  If the index
    # finds work for itself in the query, it returns the results and a
    # tuple of the attributes that were used.  If the index finds
    # nothing for it to do then it returns None.

    # Canonicalize the request into a sensible query before passing it on
    query = self.make_query(query)

    cr = self.getCatalogPlan(query)
    cr.start()

    plan = cr.plan()
    if not plan:
        plan = self._sorted_search_indexes(query)

    rs = None  # result set
    indexes = self.indexes.keys()
    for i in plan:
        if i not in indexes:
            # We can have bogus keys or the plan can contain index names
            # that have been removed in the meantime
            continue

        index = self.getIndex(i)
        _apply_index = getattr(index, "_apply_index", None)
        if _apply_index is None:
            continue

        cr.start_split(i)
        limit_result = ILimitedResultIndex.providedBy(index)
        if limit_result:
            r = _apply_index(query, rs)
        else:
            r = _apply_index(query)

        if r is not None:
            r, u = r
            # Short circuit if empty result
            # BBB: We can remove the "r is not None" check in Zope 4
            # once we don't need to support the "return everything"
            # case anymore
            if r is not None and not r:
                cr.stop_split(i, result=None, limit=limit_result)
                return LazyCat([])

            # provide detailed info about the pure intersection time
            intersect_id = i + '#intersection'
            cr.start_split(intersect_id)
            # weightedIntersection preserves the values from any mappings
            # we get, as some indexes don't return simple sets
            if hasattr(rs, 'items') or hasattr(r, 'items'):
                _, rs = weightedIntersection(rs, r)
            else:
                rs = intersection(rs, r)

            cr.stop_split(intersect_id)

            # consider the time it takes to intersect the index result
            # with the total result set to be part of the index time
            cr.stop_split(i, result=r, limit=limit_result)
            if not rs:
                break
        else:
            cr.stop_split(i, result=None, limit=limit_result)

    # Try to deduce the sort limit from batching arguments
    b_start = int(query.get('b_start', 0))
    b_size = query.get('b_size', None)
    if b_size is not None:
        b_size = int(b_size)

    if b_size is not None:
        limit = b_start + b_size
    elif limit and b_size is None:
        b_size = limit

    if sort_index is None:
        sort_report_name = None
    else:
        if isinstance(sort_index, list):
            sort_name = '-'.join(i.getId() for i in sort_index)
        else:
            sort_name = sort_index.getId()
        if isinstance(reverse, list):
            reverse_name = '-'.join(
                'desc' if r else 'asc' for r in reverse)
        else:
            reverse_name = 'desc' if reverse else 'asc'
        sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name
        if limit is not None:
            sort_report_name += '#limit-%s' % limit

    if rs is None:
        # None of the indexes found anything to do with the query.
        # We take this to mean that the query was empty (an empty
        # filter), so we return everything in the catalog.
        warnings.warn('Your query %s produced no query restriction. '
                      'Currently the entire catalog content is returned. '
                      'In Zope 4 this will result in an empty LazyCat '
                      'to be returned.' % repr(cr.make_key(query)),
                      DeprecationWarning, stacklevel=3)

        rlen = len(self)
        if sort_index is None:
            sequence, slen = self._limit_sequence(self.data.items(),
                                                  rlen, b_start, b_size)
            result = LazyMap(self.instantiate, sequence, slen,
                             actual_result_count=rlen)
        else:
            cr.start_split(sort_report_name)
            result = self.sortResults(
                self.data, sort_index, reverse, limit, merge,
                actual_result_count=rlen, b_start=b_start,
                b_size=b_size)
            cr.stop_split(sort_report_name, None)
    elif rs:
        # We got some results from the indexes.
        # Sort and convert to sequences.
        # XXX: The check for 'values' is really stupid since we call
        # items() and *not* values()
        rlen = len(rs)
        if sort_index is None and hasattr(rs, 'items'):
            # Having an 'items' method means we have a data structure
            # with scores.  Build a new result set, sort it by score,
            # reverse it, compute the normalized score, and Lazify it.

            if not merge:
                # Don't bother to sort here, return a list of
                # three tuples to be passed later to mergeResults.
                # Note that data_record_normalized_score_ cannot be
                # calculated and will always be 1 in this case.
                getitem = self.__getitem__
                result = [(score, (1, score, rid), getitem)
                          for rid, score in rs.items()]
            else:
                cr.start_split('sort_on#score')

                # sort it by score
                rs = rs.byValue(0)
                max = float(rs[0][0])

                # Here we define our getter function inline so that
                # we can conveniently store the max value as a default
                # arg and make the normalized score computation lazy
                def getScoredResult(item, max=max, self=self):
                    """
                    Returns instances of self._v_brains, or whatever is
                    passed into self.useBrains.
                    """
                    score, key = item
                    data = self.data[key]
                    klass = self._v_result_class
                    schema_len = len(klass.__record_schema__)
                    norm_score = int(100.0 * score / max)
                    if schema_len == len(data) + 3:
                        r = klass(tuple(data) + (key, score, norm_score))
                    else:
                        r = klass(data)
                        r.data_record_id_ = key
                        r.data_record_score_ = score
                        r.data_record_normalized_score_ = norm_score
                    r = r.__of__(aq_parent(self))
                    return r

                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                      b_size)
                result = LazyMap(getScoredResult, sequence, slen,
                                 actual_result_count=rlen)
                cr.stop_split('sort_on#score', None)

        elif sort_index is None and not hasattr(rs, 'values'):
            # no scores
            if hasattr(rs, 'keys'):
                rs = rs.keys()
            sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                  b_size)
            result = LazyMap(self.__getitem__, sequence, slen,
                             actual_result_count=rlen)
        else:
            # Sort.  If there are scores, then this block is not
            # reached, therefore 'sort-on' does not happen in the
            # context of a text index query.  This should probably
            # sort by relevance first, then the 'sort-on' attribute.
            cr.start_split(sort_report_name)
            result = self.sortResults(rs, sort_index, reverse, limit,
                                      merge, actual_result_count=rlen,
                                      b_start=b_start, b_size=b_size)
            cr.stop_split(sort_report_name, None)
    else:
        # Empty result set
        result = LazyCat([])
    cr.stop()
    return result
def unindex_apply_index(self, request, cid='', type=type, res=None):
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % escape(operator))

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key (the default setting), intersect
        # and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            return result, (self.id,)

        if operator == 'or':
            r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp
            r = res
            for s in setlist:
                r = intersection(r, s)
    else:  # not a range search
        # Filter duplicates, and sort by length
        keys = set(record.keys)
        setlist = []
        for k in keys:
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can't possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                return IISet(), (self.id,)
            elif isinstance(s, int):
                s = IISet((s,))
            setlist.append(s)

        # If we only use one key (the default setting), intersect
        # and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            return result, (self.id,)

        if operator == 'or':
            # If we already get a small result set passed in,
            # intersecting the various indexes with it and doing the
            # union later is faster than creating a multiunion first.
            if res is not None and len(res) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(res, s))
                r = multiunion(smalllist)
            else:
                r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)
            r = res
            for s in setlist:
                r = intersection(r, s)

    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
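# Sketch of the smallest-first ordering used for 'and' queries above:
# sorting the candidate sets by size makes the running intersection
# shrink as early as possible, and an empty result stops the loop.
from BTrees.IIBTree import IISet, intersection

setlist = [IISet(range(1000)), IISet([5, 6]), IISet(range(0, 1000, 2))]
r = None
for s in sorted(setlist, key=len):   # smallest first
    r = intersection(r, s)
    if not r:                        # cannot grow again; stop early
        break
assert list(r) == [6]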
def query_index(self, record, resultset=None):
    """Search the index with the given IndexQuery object.

    If not `None`, the resultset argument indicates that the search
    result is relevant only on this set, i.e. everything outside
    resultset is of no importance.  The index can use this information
    for optimizations.
    """
    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)

    operator = record.operator

    cachekey = None
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record)
        if cachekey is not None:
            cached = None
            if operator == 'or':
                cached = cache.get(cachekey, None)
            else:
                cached_setlist = cache.get(cachekey, None)
                if cached_setlist is not None:
                    r = resultset
                    for s in cached_setlist:
                        # the result is bound by the resultset
                        r = intersection(r, s)
                        # If intersection, we can't possibly get a
                        # smaller result
                        if not r:
                            break
                    cached = r

            if cached is not None:
                if isinstance(cached, int):
                    cached = IISet((cached, ))

                if not_parm:
                    not_parm = list(map(self._convert, not_parm))
                    exclude = self._apply_not(not_parm, resultset)
                    cached = difference(cached, exclude)

                return cached

    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = list(map(self._convert, not_parm))
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = list(map(self._convert, record.keys))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = 'range'
        opr_args = []
        if range_parm.find('min') > -1:
            opr_args.append('min')
        if range_parm.find('max') > -1:
            opr_args.append('max')

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == 'range':  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            r = multiunion(tmp)
            if cachekey is not None:
                cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp

            # 'r' is not invariant of resultset.  Thus, we
            # have to remember 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller
                # result
                if not r:
                    break
    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            if k is None:
                # Prevent None from being looked up.  None doesn't
                # have a valid ordering definition compared to any
                # other object.  BTrees 4.0+ will throw a TypeError
                # "object has default comparison".
                continue
            try:
                s = index.get(k, None)
            except TypeError:
                # key is not valid for this Btree so the value is None
                LOG.error(
                    '%(context)s: query_index tried '
                    'to look up key %(key)r from index %(index)r '
                    'but key was of the wrong type.',
                    dict(
                        context=self.__class__.__name__,
                        key=k,
                        index=self.id,
                    )
                )
                s = None
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller
                # result
                if cachekey is not None:
                    # If operator is 'and', we have to cache a list of
                    # IISet objects
                    cache[cachekey] = [IISet()]
                return IISet()
            elif isinstance(s, int):
                s = IISet((s,))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            # If we already get a small result set passed in,
            # intersecting the various indexes with it and doing the
            # union later is faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)

                # 'r' is not invariant of resultset.  Thus, we have to
                # remember the union of 'setlist'.  But this is maybe
                # a performance killer.  So we do not cache.
                # if cachekey is not None:
                #     cache[cachekey] = multiunion(setlist)
            else:
                r = multiunion(setlist)
                if cachekey is not None:
                    cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)

            # 'r' is not invariant of resultset.  Thus, we
            # have to remember the union of 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller
                # result
                if not r:
                    break

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet()
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1,
                actual_result_count=None, b_start=0, b_size=None):
    # Sort a result set using a sort index.  Return a lazy result set
    # in sorted order if merge is true, otherwise return a list of
    # (sortkey, uid, getter_function) tuples.
    #
    # The two 'for' loops in here contribute a significant
    # proportion of the time to perform an indexed search.
    # Try to avoid all non-local attribute lookup inside
    # those loops.
    _intersection = intersection
    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    result = []
    append = result.append
    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # if we want a batch from the end of the resultset, reverse sorting
    # order and limit it, then reverse the resultset again
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            overrun = rlen - b_end
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            else:
                b_size += overrun
            b_start = 0
        else:
            b_start = rlen - b_end
        limit = b_start + b_size

    if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # This is rarely exercised in practice...
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        for k, intset in sort_index.items():
            # We have an index that has a set of values for
            # each sort key, so we intersect with each set and
            # get a sorted sequence of the intersections.
            intset = _intersection(rs, intset)
            if intset:
                keys = getattr(intset, 'keys', None)
                if keys is not None:
                    # Is this ever true?
                    intset = keys()
                length += len(intset)
                append((k, intset, _self__getitem__))
        # Note that sort keys are unique.

        if reverse:
            result.sort(reverse=True)
        else:
            result.sort()
        sequence, slen = self._limit_sequence(result, length, b_start,
                                              b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen, actual_result_count)
    elif limit is None or (limit * 4 > rlen):
        # Iterate over the result set getting sort keys from the index
        for did in rs:
            try:
                key = index_key_map[did]
            except KeyError:
                # This document is not in the sort key index, skip it.
                actual_result_count -= 1
            else:
                append((key, did, _self__getitem__))
                # The reference back to __getitem__ is used in case
                # we do not merge now and need to intermingle the
                # results with those of other catalogs while avoiding
                # the cost of instantiating a LazyMap per result
        if merge:
            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            if limit is not None:
                result = result[:limit]
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif reverse:
        # Limit/sort results using the N-Best algorithm.
        # This is faster for large sets than a full sort,
        # and uses far less memory.
        keys = []
        n = 0
        worst = None
        for did in rs:
            try:
                key = index_key_map[did]
            except KeyError:
                # This document is not in the sort key index, skip it.
                actual_result_count -= 1
            else:
                if n >= limit and key <= worst:
                    continue
                i = bisect(keys, key)
                keys.insert(i, key)
                result.insert(i, (key, did, _self__getitem__))
                if n == limit:
                    del keys[0], result[0]
                else:
                    n += 1
                worst = keys[0]
        result.reverse()
        if merge:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif not reverse:
        # Limit/sort results using the N-Best algorithm in reverse
        # (N-Worst?)
        keys = []
        n = 0
        best = None
        for did in rs:
            try:
                key = index_key_map[did]
            except KeyError:
                # This document is not in the sort key index, skip it.
                actual_result_count -= 1
            else:
                if n >= limit and key >= best:
                    continue
                i = bisect(keys, key)
                keys.insert(i, key)
                result.insert(i, (key, did, _self__getitem__))
                if n == limit:
                    del keys[-1], result[-1]
                else:
                    n += 1
                best = keys[-1]
        if merge:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
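# Standalone sketch of the N-Best selection above: keep a sorted window
# of at most `limit` keys and skip any document that cannot beat the
# current worst entry (pure Python, illustrative only).
from bisect import bisect

def nbest(keys_iter, limit):
    keys, result = [], []
    n, worst = 0, None
    for key in keys_iter:
        if n >= limit and key <= worst:
            continue
        i = bisect(keys, key)
        keys.insert(i, key)
        result.insert(i, key)
        if n == limit:
            del keys[0], result[0]
        else:
            n += 1
        worst = keys[0]
    return list(reversed(result))   # largest first

assert nbest([5, 1, 9, 3, 7], 3) == [9, 7, 5]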
def dateindex_apply_index(self, request, cid="", type=type, res=None):
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    keys = map(self._convert, record.keys)

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get("operator", self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get("range", None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get("usage", None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(":")
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if "min" in opr_args:
            lo = min(keys)
        else:
            lo = None
        if "max" in opr_args:
            hi = max(keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # Union the whole setlist in one pass instead of looping,
        # to avoid large-small union penalties.
        r = multiunion(setlist)
    else:  # not a range search
        for key in keys:
            set = index.get(key, None)
            if set is not None:
                if isinstance(set, int):
                    set = IISet((set,))
                else:
                    # set can't be bigger than res
                    set = intersection(set, res)
                r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1,
                actual_result_count=None, b_start=0, b_size=None):
    # Sort a result set using a sort index.  Return a lazy result set
    # in sorted order if merge is true, otherwise return a list of
    # (sortkey, uid, getter_function) tuples.
    #
    # The two 'for' loops in here contribute a significant
    # proportion of the time to perform an indexed search.
    # Try to avoid all non-local attribute lookup inside
    # those loops.
    _intersection = intersection
    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    _None = None
    _keyerror = KeyError
    result = []
    append = result.append
    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # if we want a batch from the end of the resultset, reverse sorting
    # order and limit it, then reverse the resultset again
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            overrun = rlen - b_end
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            else:
                b_size += overrun
            b_start = 0
        else:
            b_start = rlen - b_end
        limit = b_start + b_size

    if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # This is rarely exercised in practice...
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        for k, intset in sort_index.items():
            # We have an index that has a set of values for
            # each sort key, so we intersect with each set and
            # get a sorted sequence of the intersections.
            intset = _intersection(rs, intset)
            if intset:
                keys = getattr(intset, 'keys', _None)
                if keys is not _None:
                    # Is this ever true?
                    intset = keys()
                length += len(intset)
                append((k, intset, _self__getitem__))
        # Note that sort keys are unique.

        if reverse:
            result.sort(reverse=True)
        else:
            result.sort()
        sequence, slen = self._limit_sequence(result, length, b_start,
                                              b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen, actual_result_count)
    elif limit is None or (limit * 4 > rlen):
        # Iterate over the result set getting sort keys from the index
        for did in rs:
            try:
                key = index_key_map[did]
            except _keyerror:
                # This document is not in the sort key index, skip it.
                pass
            else:
                append((key, did, _self__getitem__))
                # The reference back to __getitem__ is used in case
                # we do not merge now and need to intermingle the
                # results with those of other catalogs while avoiding
                # the cost of instantiating a LazyMap per result
        if merge:
            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            if limit is not None:
                result = result[:limit]
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif reverse:
        # Limit/sort results using the N-Best algorithm.
        # This is faster for large sets than a full sort,
        # and uses far less memory.
        keys = []
        n = 0
        worst = None
        for did in rs:
            try:
                key = index_key_map[did]
            except _keyerror:
                # This document is not in the sort key index, skip it.
                pass
            else:
                if n >= limit and key <= worst:
                    continue
                i = bisect(keys, key)
                keys.insert(i, key)
                result.insert(i, (key, did, _self__getitem__))
                if n == limit:
                    del keys[0], result[0]
                else:
                    n += 1
                worst = keys[0]
        result.reverse()
        if merge:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif not reverse:
        # Limit/sort results using the N-Best algorithm in reverse
        # (N-Worst?)
        keys = []
        n = 0
        best = None
        for did in rs:
            try:
                key = index_key_map[did]
            except _keyerror:
                # This document is not in the sort key index, skip it.
                pass
            else:
                if n >= limit and key >= best:
                    continue
                i = bisect(keys, key)
                keys.insert(i, key)
                result.insert(i, (key, did, _self__getitem__))
                if n == limit:
                    del keys[-1], result[-1]
                else:
                    n += 1
                best = keys[-1]
        if merge:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the argument.

    Normalize the 'query' arguments into integer values at minute
    precision before querying.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    keys = map(self._convert, record.keys)

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get('range', None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)
        r = multiunion(setlist)
    else:  # not a range search
        for key in keys:
            set = index.get(key, None)
            if set is not None:
                if isinstance(set, int):
                    set = IISet((set,))
                else:
                    # set can't be bigger than resultset
                    set = intersection(set, resultset)
                r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
    """Iterate through the indexes, applying the query to each one.

    If merge is true, return a lazy result set (sorted if appropriate);
    otherwise return the raw (possibly scored) results for later
    merging.  Limit is used in conjunction with sorting or scored
    results to inform the catalog how many results you are really
    interested in.  The catalog can then use optimizations to save time
    and memory.  The number of results is not guaranteed to fall within
    the limit, however; you should still slice or batch the results as
    usual."""
    rs = None  # resultset

    # Indexes fulfill a fairly large contract here.  We hand each
    # index the query mapping we are given (which may be composed
    # of some combination of web request, kw mappings or plain old
    # dicts) and the index decides what to do with it.  If the index
    # finds work for itself in the query, it returns the results and a
    # tuple of the attributes that were used.  If the index finds
    # nothing for it to do then it returns None.

    # Canonicalize the request into a sensible query before passing it on
    query = self.make_query(query)

    cr = self.getCatalogPlan(query)
    cr.start()

    plan = cr.plan()
    if not plan:
        plan = self._sorted_search_indexes(query)

    indexes = self.indexes.keys()
    for i in plan:
        if i not in indexes:
            # We can have bogus keys or the plan can contain index names
            # that have been removed in the meantime
            continue

        index = self.getIndex(i)
        _apply_index = getattr(index, "_apply_index", None)
        if _apply_index is None:
            continue

        cr.start_split(i)
        limit_result = ILimitedResultIndex.providedBy(index)
        if limit_result:
            r = _apply_index(query, rs)
        else:
            r = _apply_index(query)

        if r is not None:
            r, u = r
            # Short circuit if empty result
            # BBB: We can remove the "r is not None" check in Zope 2.14
            # once we don't need to support the "return everything"
            # case anymore
            if r is not None and not r:
                cr.stop_split(i, result=None, limit=limit_result)
                return LazyCat([])

            # provide detailed info about the pure intersection time
            intersect_id = i + '#intersection'
            cr.start_split(intersect_id)
            # weightedIntersection preserves the values from any mappings
            # we get, as some indexes don't return simple sets
            if hasattr(rs, 'items'):
                _, rs = weightedIntersection(rs, r)
            else:
                rs = intersection(rs, r)

            cr.stop_split(intersect_id)

            # consider the time it takes to intersect the index result
            # with the total resultset to be part of the index time
            cr.stop_split(i, result=r, limit=limit_result)
            if not rs:
                break
        else:
            cr.stop_split(i, result=None, limit=limit_result)

    # Try to deduce the sort limit from batching arguments
    b_start = int(query.get('b_start', 0))
    b_size = query.get('b_size', None)
    if b_size is not None:
        b_size = int(b_size)

    if b_size is not None:
        limit = b_start + b_size
    elif limit and b_size is None:
        b_size = limit

    if rs is None:
        # None of the indexes found anything to do with the query.
        # We take this to mean that the query was empty (an empty
        # filter), so we return everything in the catalog.
        warnings.warn('Your query %s produced no query restriction. '
                      'Currently the entire catalog content is returned. '
                      'In Zope 2.14 this will result in an empty LazyCat '
                      'to be returned.' % repr(cr.make_key(query)),
                      DeprecationWarning, stacklevel=3)

        rlen = len(self)
        if sort_index is None:
            sequence, slen = self._limit_sequence(self.data.items(),
                                                  rlen, b_start, b_size)
            result = LazyMap(self.instantiate, sequence, slen,
                             actual_result_count=rlen)
        else:
            cr.start_split('sort_on')
            result = self.sortResults(self.data, sort_index, reverse,
                                      limit, merge,
                                      actual_result_count=rlen,
                                      b_start=b_start, b_size=b_size)
            cr.stop_split('sort_on', None)
    elif rs:
        # We got some results from the indexes.
        # Sort and convert to sequences.
        # XXX: The check for 'values' is really stupid since we call
        # items() and *not* values()
        rlen = len(rs)
        if sort_index is None and hasattr(rs, 'items'):
            # Having an 'items' method means we have a data structure
            # with scores.  Build a new result set, sort it by score,
            # reverse it, compute the normalized score, and Lazify it.

            if not merge:
                # Don't bother to sort here, return a list of
                # three tuples to be passed later to mergeResults.
                # Note that data_record_normalized_score_ cannot be
                # calculated and will always be 1 in this case.
                getitem = self.__getitem__
                result = [(score, (1, score, rid), getitem)
                          for rid, score in rs.items()]
            else:
                cr.start_split('sort_on')

                rs = rs.byValue(0)  # sort it by score
                max = float(rs[0][0])

                # Here we define our getter function inline so that
                # we can conveniently store the max value as a default
                # arg and make the normalized score computation lazy
                def getScoredResult(item, max=max, self=self):
                    """
                    Returns instances of self._v_brains, or whatever is
                    passed into self.useBrains.
                    """
                    score, key = item
                    r = self._v_result_class(self.data[key])\
                        .__of__(aq_parent(self))
                    r.data_record_id_ = key
                    r.data_record_score_ = score
                    r.data_record_normalized_score_ = int(
                        100.0 * score / max)
                    return r

                sequence, slen = self._limit_sequence(
                    rs, rlen, b_start, b_size)
                result = LazyMap(getScoredResult, sequence, slen,
                                 actual_result_count=rlen)
                cr.stop_split('sort_on', None)

        elif sort_index is None and not hasattr(rs, 'values'):
            # no scores
            if hasattr(rs, 'keys'):
                rs = rs.keys()
            sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                  b_size)
            result = LazyMap(self.__getitem__, sequence, slen,
                             actual_result_count=rlen)
        else:
            # Sort.  If there are scores, then this block is not
            # reached, therefore 'sort-on' does not happen in the
            # context of a text index query.  This should probably
            # sort by relevance first, then the 'sort-on' attribute.
            cr.start_split('sort_on')
            result = self.sortResults(rs, sort_index, reverse, limit,
                                      merge, actual_result_count=rlen,
                                      b_start=b_start, b_size=b_size)
            cr.stop_split('sort_on', None)
    else:
        # Empty result set
        result = LazyCat([])
    cr.stop()
    return result
def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the argument.

    Normalize the 'query' arguments into integer values at minute
    precision before querying.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    keys = map(self._convert, record.keys)

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get('range', None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)
        r = multiunion(setlist)
    else:  # not a range search
        for key in keys:
            set = index.get(key, None)
            if set is not None:
                if isinstance(set, int):
                    set = IISet((set,))
                else:
                    # set can't be bigger than resultset
                    set = intersection(set, resultset)
                r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in 'request', which
    should be a mapping object.

    If the request does not contain the needed parameters, then
    return None.

    Otherwise return two objects.  The first object is a ResultSet
    containing the record numbers of the matching records.  The second
    object is a tuple containing the names of all data fields used.
    """
    iid = self.id
    record = parseIndexRequest(request, iid, self.query_options)
    if record.keys is None:
        return None

    term = self._convertDateTime(record.keys[0])

    REQUEST = aq_get(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = self._cache_key(catalog)
            cache = REQUEST.get(key, None)
            tid = isinstance(term, int) and term / 10 or 'None'
            if resultset is None:
                cachekey = '_daterangeindex_%s_%s' % (iid, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (iid, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if resultset is None:
                        return (cached,
                                (self._since_field, self._until_field))
                    else:
                        return (difference(resultset, cached),
                                (self._since_field, self._until_field))

    if resultset is None:
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))

        # Total result is bound by resultset
        if REQUEST is None:
            until = intersection(resultset, until)

        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        # Merge from smallest to largest.
        result = multiunion([bounded, until_only, since_only,
                             self._always])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result
        return (result, (self._since_field, self._until_field))
    else:
        # Compute the inverse and subtract it from the passed-in resultset
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([until_only, since_only, until, since])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result
        return (difference(resultset, result),
                (self._since_field, self._until_field))
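# --- Illustrative sketch (not part of the index source): the inverse trick
# used above when a resultset is passed in.  Instead of computing the
# usually large set of currently-effective documents, compute everything
# *outside* the range and subtract it.  All sets below are hypothetical.
from BTrees.IIBTree import IISet, multiunion, difference

resultset = IISet(range(10))
# expired documents plus documents not yet effective
outside = multiunion([IISet((0, 1)), IISet((8, 9))])
print list(difference(resultset, outside))   # -> [2, 3, 4, 5, 6, 7]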
def sortResults(self, rs, sort_index, reverse=False, limit=None,
                merge=True, actual_result_count=None, b_start=0,
                b_size=None):
    # Sort a result set using one or more sort indexes.  Both sort_index
    # and reverse can be lists of indexes and reverse specifications.
    # Return a lazy result set in sorted order if merge is true, otherwise
    # return a list of (sortkey, uid, getter_function) tuples, where
    # sortkey can be a tuple on its own.
    second_indexes = None
    second_indexes_key_map = None
    sort_index_length = 1
    if isinstance(sort_index, list):
        sort_index_length = len(sort_index)
        if sort_index_length > 1:
            second_indexes = sort_index[1:]
            second_indexes_key_map = []
            for si in second_indexes:
                second_indexes_key_map.append(si.documentToKeyMap())
        sort_index = sort_index[0]

    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    result = []
    r_append = result.append
    r_insert = result.insert

    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # if we want a batch from the end of the result set, reverse sorting
    # order and limit it, then reverse the result set again
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        if isinstance(reverse, list):
            reverse = [not r for r in reverse]
        else:
            reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            overrun = rlen - b_end
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            else:
                b_size += overrun
            b_start = 0
        else:
            b_start = rlen - b_end
        limit = b_start + b_size

    # determine sort_spec
    if isinstance(reverse, list):
        sort_spec = [r and -1 or 1 for r in reverse]
        # limit to current maximum of sort indexes
        sort_spec = sort_spec[:sort_index_length]
        # use first sort order for choosing the algorithm
        first_reverse = reverse[0]
    else:
        sort_spec = []
        for i in xrange(sort_index_length):
            sort_spec.append(reverse and -1 or 1)
        first_reverse = reverse

    if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # TODO: len(sort_index) isn't actually what we want for a keyword
        # index, as it's only the unique values, not the documents.
        # Don't use this case while using limit, as we return results of
        # non-flattened intsets, and would have to merge/unflatten those
        # before limiting.
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        if sort_index_length == 1:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    r_append((k, intset, _self__getitem__))
            result.sort(reverse=reverse)
        else:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    # sort on secondary index
                    keysets = defaultdict(list)
                    for i in intset:
                        full_key = (k, )
                        for km in second_indexes_key_map:
                            try:
                                full_key += (km[i], )
                            except KeyError:
                                pass
                        keysets[full_key].append(i)
                    for k2, v2 in keysets.items():
                        r_append((k2, v2, _self__getitem__))
            result = multisort(result, sort_spec)
        sequence, slen = self._limit_sequence(result, length, b_start,
            b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen, actual_result_count)
    elif limit is None or (limit * 4 > rlen):
        # Iterate over the result set getting sort keys from the index.
        # If we are interested in 25% or more of the result set, the
        # N-Best algorithm is slower, so we iterate over everything.
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result.
                    r_append((key, did, _self__getitem__))
            if merge:
                result.sort(reverse=reverse)
        else:
            for did in rs:
                try:
                    full_key = (index_key_map[did], )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    r_append((full_key, did, _self__getitem__))
            if merge:
                result = multisort(result, sort_spec)
        if merge:
            if limit is not None:
                result = result[:limit]
            sequence, _ = self._limit_sequence(result, 0, b_start,
                b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                b_size, switched_reverse)
            return sequence
    elif first_reverse:
        # Limit and sort results using the N-Best algorithm.  This is
        # faster for large sets than a full sort, and uses far less memory.
        keys = []
        k_insert = keys.insert
        n = 0
        worst = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (full_key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result = multisort(result, sort_spec)
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
            switched_reverse)
        if merge:
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            return sequence
    elif not first_reverse:
        # Limit and sort results using the N-Best algorithm in reverse
        # (N-Worst?)
        keys = []
        k_insert = keys.insert
        n = 0
        best = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (full_key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
        result = multisort(result, sort_spec)
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
            switched_reverse)
        if merge:
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            return sequence

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
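# --- Illustrative sketch (not part of the catalog source): the N-Best
# selection used by the first_reverse branch of sortResults() above, as a
# standalone function.  Keep a sorted window of at most `limit` keys and
# skip any document whose key cannot beat the current worst.  The sample
# (sortkey, docid) pairs are hypothetical.
from bisect import bisect

def n_best(pairs, limit):
    # Return the `limit` entries with the largest keys, best first.
    keys, best = [], []
    n, worst = 0, None
    for key, did in pairs:
        if n >= limit and key <= worst:
            continue   # cannot make it into the window, skip cheaply
        i = bisect(keys, key)
        keys.insert(i, key)
        best.insert(i, (key, did))
        if n == limit:
            del keys[0], best[0]   # evict the current worst
        else:
            n += 1
        worst = keys[0]
    best.reverse()
    return best

# n_best([(3, 'a'), (9, 'b'), (1, 'c'), (7, 'd')], 2)
#   -> [(9, 'b'), (7, 'd')]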
def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the request arg.

    The request argument should be a mapping object.

    If the request does not have a key which matches the "id" of the
    index instance, then None is returned.

    If the request *does* have a key which matches the "id" of the
    index instance, one of a few things can happen:

      - if the value is a blank string, None is returned (in order to
        support requests from web forms where you can't tell a blank
        string from empty).

      - if the value is a nonblank string, turn the value into a
        single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

      - if the value is a dict and contains a key of the form
        '<index>_operator', this overrides the default method ('or')
        of combining search results.  Valid values are "or" and "and".

    If None is not returned as a result of the abovementioned
    constraints, two objects are returned.  The first object is a
    ResultSet containing the record numbers of the matching records.
    The second object is a tuple containing the names of all data
    fields used.

    FAQ answer: to search a Field Index for documents that have a blank
    string as their value, wrap the request value up in a tuple, like:
    request = {'id': ('',)}
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if not operator in self.operators:
        raise RuntimeError("operator not valid: %s" % escape(operator))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            return result, (self.id, )

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            r = multiunion(tmp)
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp
            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can't possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                return IISet(), (self.id, )
            elif isinstance(s, int):
                s = IISet((s, ))
            setlist.append(s)

        # If we only use one key, return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            return result, (self.id, )

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)
            else:
                r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)
            r = resultset
            for s in setlist:
                r = intersection(r, s)

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet(), (self.id, )
    else:
        return r, (self.id, )
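# --- Illustrative sketch (not part of the index source): the
# smallest-first intersection ordering used above.  Sorting the candidate
# sets by size keeps every intermediate intersection small.  The docid
# buckets below are hypothetical; note that the module-level
# intersection() treats None as the identity, which is why the loop can
# start from r = None (or from a passed-in resultset).
from BTrees.IIBTree import IISet, intersection

setlist = [IISet(xrange(0, 1000)), IISet((5, 6)), IISet(xrange(0, 100))]
r = None
for s in sorted(setlist, key=len):
    r = intersection(r, s)   # intersection(None, s) returns s
print list(r)                # -> [5, 6]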
def _select_rids(self, query):
    """Searches the table for matches, returning record ids.

    Returns a sequence of record ids, or None for all records.
    """
    primary_key = []
    params = 0          # The number of parameters specified
    primary_params = 0  # The number of primary params specified
    for position, column in self.col_info:
        value = query[position]
        if value is not None:
            params += 1
            if column.primary:
                primary_params += 1
                if primary_key is not None:
                    primary_key.append(value)
        elif column.primary:
            # Didn't fully specify the primary key.
            # Can't search by primary key.
            primary_key = None

    if not params:
        # No query.  Select all.
        return None

    # First strategy: try to satisfy the request by consulting
    # the primary key index.
    if primary_key:
        # The primary key is complete.  The result set will have
        # either zero rows or one row.
        primary_key = tuple(primary_key)
        rid = self.primary_index.get(primary_key)
        if rid is None:
            return ()
        # Possibly filter out the single item.
        if params > primary_params:
            cand = self.data[rid]
            for position, column in self.col_info:
                if query[position] is not None:
                    if cand[position] != query[position]:
                        # Not a match.
                        return ()
        return (rid, )

    # Second strategy: try to satisfy the request by intersecting
    # indexes.
    rids = None
    iteration_filters = []
    for position, column in self.col_info:
        value = query[position]
        if value is not None:
            index = self.indexes.get(column.name)
            if index is None:
                iteration_filters.append((position, value))
            else:
                set = index.get(value)
                if set is None:
                    # No rows satisfy this criterion.
                    return ()
                if rids is None:
                    rids = set
                else:
                    rids = intersection(rids, set)
                if not rids:
                    # No rows satisfy all criteria.
                    return ()
    if rids is not None:
        rids = rids.keys()

    if not iteration_filters:
        # Indexes did all the work.  No need to search each record.
        return rids

    # Fallback strategy: eliminate items one by one.
    if rids is None:
        # Use the whole data set.
        candidates = self.data.values()
    else:
        # Use the specified records.
        candidates = [self.data[rid] for rid in rids]

    rids = []
    append = rids.append
    for cand in candidates:
        for position, value in iteration_filters:
            if cand[position] != value:
                # Not a match.
                break
        else:
            # A match.
            append(cand[0])
    return rids
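# --- Illustrative sketch (not part of the table source): the for/else
# filtering idiom used in the fallback strategy above.  The else clause
# runs only when no filter broke out of the inner loop, i.e. the row
# matched every remaining criterion.  Rows and filters are hypothetical.
rows = [(1, 'a', 'x'), (2, 'b', 'x'), (3, 'a', 'y')]
filters = [(1, 'a'), (2, 'x')]   # (position, required value)
matches = []
for row in rows:
    for position, value in filters:
        if row[position] != value:
            break
    else:
        matches.append(row[0])
print matches                    # -> [1]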
def sortResults(self, rs, sort_index, reverse=False, limit=None,
                merge=True, actual_result_count=None, b_start=0,
                b_size=None):
    # Sort a result set using one or more sort indexes.  Both sort_index
    # and reverse can be lists of indexes and reverse specifications.
    # Return a lazy result set in sorted order if merge is true, otherwise
    # return a list of (sortkey, uid, getter_function) tuples, where
    # sortkey can be a tuple on its own.
    second_indexes = None
    second_indexes_key_map = None
    sort_index_length = 1
    if isinstance(sort_index, list):
        sort_index_length = len(sort_index)
        if sort_index_length > 1:
            second_indexes = sort_index[1:]
            second_indexes_key_map = []
            for si in second_indexes:
                second_indexes_key_map.append(si.documentToKeyMap())
        sort_index = sort_index[0]

    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    result = []
    r_append = result.append
    r_insert = result.insert

    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # if we want a batch from the end of the result set, reverse sorting
    # order and limit it, then reverse the result set again
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        if isinstance(reverse, list):
            reverse = [not r for r in reverse]
        else:
            reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            overrun = rlen - b_end
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            else:
                b_size += overrun
            b_start = 0
        else:
            b_start = rlen - b_end
        limit = b_start + b_size

    # determine sort_spec
    if isinstance(reverse, list):
        sort_spec = [r and -1 or 1 for r in reverse]
        # limit to current maximum of sort indexes
        sort_spec = sort_spec[:sort_index_length]
        # use first sort order for choosing the algorithm
        first_reverse = reverse[0]
    else:
        sort_spec = []
        for i in xrange(sort_index_length):
            sort_spec.append(reverse and -1 or 1)
        first_reverse = reverse

    if merge and (rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # TODO: len(sort_index) isn't actually what we want for a keyword
        # index, as it's only the unique values, not the documents.
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        if sort_index_length == 1:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    r_append((k, intset, _self__getitem__))
            result.sort(reverse=reverse)
        else:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    # sort on secondary index
                    keysets = defaultdict(list)
                    for i in intset:
                        full_key = (k, )
                        for km in second_indexes_key_map:
                            try:
                                full_key += (km[i], )
                            except KeyError:
                                pass
                        keysets[full_key].append(i)
                    for k2, v2 in keysets.items():
                        r_append((k2, v2, _self__getitem__))
            result = multisort(result, sort_spec)
        sequence, slen = self._limit_sequence(result, length, b_start,
            b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen, actual_result_count)
    elif limit is None or (limit * 4 > rlen):
        # Iterate over the result set getting sort keys from the index.
        # If we are interested in 25% or more of the result set, the
        # N-Best algorithm is slower, so we iterate over everything.
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result.
                    r_append((key, did, _self__getitem__))
            if merge:
                result.sort(reverse=reverse)
        else:
            for did in rs:
                try:
                    full_key = (index_key_map[did], )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    r_append((full_key, did, _self__getitem__))
            if merge:
                result = multisort(result, sort_spec)
        if merge:
            if limit is not None:
                result = result[:limit]
            sequence, _ = self._limit_sequence(result, 0, b_start,
                b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                b_size, switched_reverse)
            return sequence
    elif first_reverse:
        # Limit and sort results using the N-Best algorithm.  This is
        # faster for large sets than a full sort, and uses far less memory.
        keys = []
        k_insert = keys.insert
        n = 0
        worst = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (full_key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result = multisort(result, sort_spec)
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
            switched_reverse)
        if merge:
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            return sequence
    elif not first_reverse:
        # Limit and sort results using the N-Best algorithm in reverse
        # (N-Worst?)
        keys = []
        k_insert = keys.insert
        n = 0
        best = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (full_key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
        result = multisort(result, sort_spec)
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
            switched_reverse)
        if merge:
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            return sequence

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
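# --- Illustrative sketch (not part of the catalog source): what sort_spec
# encodes -- one entry per sort index, 1 for ascending and -1 for
# descending.  This is a pure-Python illustration of multi-key sorting
# with such a spec (stable sorts applied from last key to first), not the
# multisort() implementation itself.  The rows are hypothetical
# (key_tuple, rid, getter) entries.
rows = [((2, 'b'), 10, None), ((1, 'z'), 11, None), ((1, 'a'), 12, None)]
sort_spec = [1, -1]   # first key ascending, second key descending
for pos, order in reversed(list(enumerate(sort_spec))):
    rows.sort(key=lambda row, p=pos: row[0][p], reverse=(order == -1))
print [row[1] for row in rows]   # -> [11, 12, 10]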
def search(self, path, default_level=0, depth=-1, navtree=0,
           navtree_start=0):
    """
    path is either a string representing a relative URL or a part of a
    relative URL, or a tuple (path, level).

    level >= 0  starts searching at the given level
    level <  0  not implemented yet
    """
    if isinstance(path, basestring):
        startlevel = default_level
    else:
        startlevel = int(path[1])
        path = path[0]

    absolute_path = isinstance(path, basestring) and path.startswith('/')
    comps = filter(None, path.split('/'))
    orig_comps = [''] + comps[:]

    if depth > 0:
        raise ValueError, "Can't do depth searches anymore"

    if not comps:
        comps = ['dmd']
        startlevel = 1
    else:
        if comps[0] == getCSEConf().get('virtualroot', '').replace('/', ''):
            comps = comps[1:]
        if comps[0] == 'zport':
            comps = comps[1:]
        if comps[0] != 'dmd':
            raise ValueError, "Depth searches must start with 'dmd'"
        startlevel = len(comps)

    if len(comps) == 0:
        if depth == -1 and not navtree:
            return IISet(self._unindex.keys())

    # Make sure that we get depth = 1 if in navtree mode,
    # unless specified otherwise
    orig_depth = depth
    if depth == -1:
        depth = 0 or navtree

    # Optimized navtree starting with absolute path
    if absolute_path and navtree and depth == 1 and default_level == 0:
        set_list = []
        # Insert root element
        if navtree_start >= len(orig_comps):
            navtree_start = 0
        # create a set of parent paths to search
        for i in range(len(orig_comps), navtree_start, -1):
            parent_path = '/'.join(orig_comps[:i])
            parent_path = parent_path and parent_path or '/'
            try:
                set_list.append(self._index_parents[parent_path])
            except KeyError:
                pass
        return multiunion(set_list)
    # Optimized breadcrumbs
    elif absolute_path and navtree and depth == 0 and default_level == 0:
        item_list = IISet()
        # Insert root element
        if navtree_start >= len(orig_comps):
            navtree_start = 0
        # create a set of parent paths to search
        for i in range(len(orig_comps), navtree_start, -1):
            parent_path = '/'.join(orig_comps[:i])
            parent_path = parent_path and parent_path or '/'
            try:
                item_list.insert(self._index_items[parent_path])
            except KeyError:
                pass
        return item_list
    # Specific object search
    elif absolute_path and orig_depth == 0 and default_level == 0:
        try:
            return IISet([self._index_items[path]])
        except KeyError:
            return IISet()
    # Single depth search
    elif absolute_path and orig_depth == 1 and default_level == 0:
        # only get objects contained in requested folder
        try:
            return self._index_parents[path]
        except KeyError:
            return IISet()
    # Sitemaps, relative paths, and depth queries
    elif startlevel >= 0:
        pathset = None   # Same as pathindex
        navset = None    # For collecting siblings along the way
        depthset = None  # For limiting depth

        if navtree and depth and \
               self._index.has_key(None) and \
               self._index[None].has_key(startlevel):
            navset = self._index[None][startlevel]

        for level in range(startlevel, startlevel + len(comps)):
            if level <= len(comps):
                comp = "/".join(comps[:level])
                if (not self._index.has_key(comp) or
                        not self._index[comp].has_key(level)):
                    # Navtree is inverse, keep going even for
                    # nonexisting paths
                    if navtree:
                        pathset = IISet()
                    else:
                        return IISet()
                else:
                    return self._index[comp][level]
                if navtree and depth and \
                       self._index.has_key(None) and \
                       self._index[None].has_key(level + depth):
                    navset = union(navset, intersection(
                        pathset, self._index[None][level + depth]))
            if level - startlevel >= len(comps) or navtree:
                if (self._index.has_key(None) and
                        self._index[None].has_key(level)):
                    depthset = union(depthset, intersection(
                        pathset, self._index[None][level]))

        if navtree:
            return union(depthset, navset) or IISet()
        elif depth:
            return depthset or IISet()
        else:
            return pathset or IISet()
    else:
        results = IISet()
        for level in range(0, self._depth + 1):
            ids = None
            error = 0
            for cn in range(0, len(comps)):
                comp = comps[cn]
                try:
                    ids = intersection(ids, self._index[comp][level + cn])
                except KeyError:
                    error = 1
            if error == 0:
                results = union(results, ids)
        return results
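# --- Illustrative sketch (not part of the index source): the
# component/level structure the relative-path branch walks.  A path index
# maps _index[component][level] to an IISet of docids whose path has
# `component` at `level`; a relative query intersects one such set per
# component.  The tiny index and docids below are hypothetical.
from BTrees.IIBTree import IISet, intersection

_index = {'dmd': {0: IISet((1, 2, 3))},
          'devices': {1: IISet((2, 3))},
          'server': {2: IISet((3, ))}}

def relative_search(comps, startlevel):
    res = None
    for i, comp in enumerate(comps):
        try:
            res = intersection(res, _index[comp][startlevel + i])
        except KeyError:
            return IISet()
    return res or IISet()

print list(relative_search(['devices', 'server'], 1))   # -> [3]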
import time
from random import choice

from BTrees.IIBTree import IISet, union, intersection, difference


def make_choice(data, per):
    # Draw a random sample sized as `per` percent of the data
    # (with replacement).  range() needs an int, so truncate.
    data_len = len(data)
    return [choice(data) for i in range(0, int(data_len * float(per) / 100.0))]


for max in (500, 2500, 5000, 10000, 25000, 50000, 100000):
    data = range(max)
    for p1, p2 in ((25, 25), (25, 50), (25, 75), (25, 100), (50, 50),
                   (50, 75), (50, 100), (75, 75), (75, 100), (100, 100)):
        d1 = IISet(make_choice(data, p1))
        d2 = IISet(make_choice(data, p2))

        ts = time.time()
        union(d1, d2)
        tu = time.time() - ts

        ts = time.time()
        intersection(d1, d2)
        ti = time.time() - ts

        ts = time.time()
        difference(d1, d2)
        td = time.time() - ts

        # Columns: set size, sample percentages, then union,
        # intersection, and difference timings in seconds.
        print '%6d %3d:%3d %6.6f %6.6f %6.6f' % (max, p1, p2, tu, ti, td)