def test_heavy_end(self):
    bigsize = BIGSETSIZE
    smallsize = SMALLSETSIZE

    small = IISet(xrange(bigsize - smallsize, bigsize))
    large = IITreeSet(xrange(smallsize))
    self.timing(small, large,
                'Intersection small set high values + small treeset')
    self.timing(large, small,
                'Intersection small treeset + small set high values')

    small = IISet(xrange(bigsize - smallsize, bigsize))
    large = IITreeSet(xrange(bigsize))
    self.timing(small, large,
                'Intersection small set high values + large treeset')
    self.timing(large, small,
                'Intersection large treeset + small set high values')

    small = IISet(xrange(bigsize - smallsize, bigsize))
    large = IISet(xrange(bigsize))
    self.timing(small, large,
                'Intersection small set high values + large set')
    self.timing(large, small,
                '\nIntersection large set + small set high values')
def testLargerInputs(self):
    from random import randint
    MAXSIZE = 200
    MAXVAL = 400
    for i in range(3):
        n = randint(0, MAXSIZE)
        Akeys = [randint(1, MAXVAL) for j in range(n)]
        As = [makeset(Akeys) for makeset in self.builders]
        Akeys = IISet(Akeys)

        n = randint(0, MAXSIZE)
        Bkeys = [randint(1, MAXVAL) for j in range(n)]
        Bs = [makeset(Bkeys) for makeset in self.builders]
        Bkeys = IISet(Bkeys)

        for op, simulator in ((self.union, self._union),
                              (self.intersection, self._intersection),
                              (self.difference, self._difference)):
            for A in As:
                for B in Bs:
                    got = op(A, B)
                    want = simulator(Akeys, Bkeys)
                    self.assertEqual(list(got), want,
                                     (A, B, Akeys, Bkeys, list(got), want))
def register(self, hrefs, referer, timestamp):
    """Add or update link presence information.

    If a link has not been checked since the provided timestamp, or is
    not yet in the database, it is added to the queue.
    """
    referer = self.index.get(referer) or self.store(referer)

    registry = getUtility(IRegistry, context=self.aq_parent)
    try:
        settings = registry.forInterface(ISettings)
    except KeyError as exc:
        logger.warn(exc)
        return

    limit = settings.referers

    for href in hrefs:
        if self.should_ignore(href, settings.ignore_list):
            continue

        # If the hyperlink is not already in the work queue,
        # compare the provided timestamp to our database to see if
        # we need to check its validity. Note that internal links
        # are exempt if we're not using the publisher.
        index = self.index.get(href)
        entry = self.checked.get(-1 if index is None else index)
        if index not in self.queue:
            if entry is None or entry[0] < timestamp:
                if settings.use_publisher or not href.startswith('/'):
                    index = self.enqueue(href)
                elif href not in self.index:
                    index = self.store(href)

        assert index is not None

        if entry is None:
            self.checked[index] = None, None, IISet((referer, ))
        else:
            # If the provided paths are a subset of the already
            # seen paths, and if there is no new referer, we don't
            # issue an update.
            referers = entry[2]
            if referer not in referers and len(referers) <= limit:
                referers.add(referer)
def test_depth_limit_resultset(self):
    self._populateIndex()
    resultset = IISet([1, 2, 3, 4, 8, 16])
    tests = [
        # depth, expected result
        (1, [1, 8, 16]),
        (2, [1, 2, 8, 16]),
        (3, [1, 2, 3, 8, 16]),
    ]
    for depth, results in tests:
        res = self._index._apply_index(
            dict(path=dict(query='/', depth=depth)),
            resultset=resultset)
        combined = intersection(res[0], resultset)
        lst = list(combined)
        self.assertEqual(lst, results)
def _apply_index(self, request, cid=''):
    """Apply the index to the search parameters given in request"""
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    result = None
    fallback = self.fallback
    if hasattr(record, 'fallback'):
        fallback = bool(record.fallback)

    for language in record.keys:
        rows = self._search(language, fallback)
        result = ii_union(result, rows)

    return (result or IISet()), ('Language', )
def _search(self, language, fallback=True):
    main, sub = splitLanguage(language)

    if main not in self._index:
        return None

    if fallback:
        # Search in sorted order, specific sub tag first, None second
        subs = list(self._index[main].keys())
        subs.sort()
        if sub in subs:
            subs.remove(sub)
            subs.insert(0, sub)
    else:
        subs = [sub]

    result = OOSet()
    for sublanguage in subs:
        result = oo_union(result, self._index[main][sublanguage])

    return IISet(entry.docid for entry in result)
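# A minimal standalone sketch (not part of the original module) of the
# fallback ordering built above: the requested sub tag is searched first,
# then the remaining sub tags, including None for the bare main language,
# in sorted order. Note that sorting a list containing None relies on
# Python 2 ordering (None sorts before strings), as the code above does.
def _demo_fallback_order():
    subs = [None, 'at', 'ch', 'de']  # sub tags indexed under one main tag
    sub = 'ch'
    subs.sort()
    if sub in subs:
        subs.remove(sub)
        subs.insert(0, sub)
    assert subs == ['ch', None, 'at', 'de']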
def languageindex_apply_index(self, request, cid='', res=None):
    """Apply the index to the search parameters given in request"""
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    result = None
    fallback = self.fallback
    if hasattr(record, 'fallback'):
        fallback = bool(record.fallback)

    # TODO: This could be optimized to avoid a loop per language.
    # As we most often get a language code and '' for language neutral,
    # this could be beneficial. If the site has no language neutral
    # content, the first check "main not in self._index" will return
    # None. The union of None with the resultset is a cheap call, so
    # we don't care right now.
    for language in record.keys:
        rows = self._search(language, fallback, res=res)
        result = ii_union(result, rows)

    return (result or IISet()), ('Language', )
def query_index(self, record, resultset=None):
    """See IPluggableIndex.

    o Unpacks record from catalog and maps it onto '_search'.
    """
    level = record.get('level', 0)
    operator = record.operator

    # depending on the operator we use intersection or union
    if operator == 'or':
        set_func = union
    else:
        set_func = intersection

    res = None
    for k in record.keys:
        rows = self._search(k, level)
        res = set_func(res, rows)

    if res:
        return res
    return IISet()
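# A minimal illustration (separate from the original code) of the BTrees
# behavior the accumulator loop above relies on: both union() and
# intersection() return the other operand when one operand is None, so
# starting with res = None is safe for either set_func.
def _demo_none_is_identity():
    from BTrees.IIBTree import IISet, union, intersection
    s = IISet([1, 2, 3])
    assert list(union(None, s)) == [1, 2, 3]
    assert list(intersection(None, s)) == [1, 2, 3]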
def update(self, href, status):
    """Update link status."""
    now = datetime.datetime.now()
    timestamp = int(time.mktime(now.timetuple()))

    index = self.index.get(href)
    if index is None:
        return

    entry = self.checked.get(-1 if index is None else index)
    if entry is None:
        self.checked[index] = timestamp, status, IISet()

    # If the status changed, we update the entry.
    elif status != entry[1] or not entry[0]:
        # If the status was previously good, then we clear the
        # status. What this means is that we'll wait for the next
        # check to declare a bad status (it might be temporary).
        if entry[1] == 200:
            status = None

        self.checked[index] = timestamp, status, entry[2]
def _apply_index(self, request, resultset=None):
    """ hook for (Z)Catalog

    'request' -- mapping type (usually {"path": "..."})

    additionally a parameter "path_level" might be passed
    to specify the level (see search())
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    level = record.get("level", 0)
    operator = record.get('operator', self.useOperator).lower()
    depth = getattr(record, 'depth', -1)  # use getattr to get 0 value
    navtree = record.get('navtree', 0)
    navtree_start = record.get('navtree_start', 0)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    result = None
    for k in record.keys:
        rows = self.search(k, level, depth, navtree, navtree_start,
                           resultset=resultset)
        result = set_func(result, rows)

    if result:
        return (result, (self.id, ))
    else:
        return (IISet(), (self.id, ))
def assignWordId(self, word):
    """Assign a new word id to the provided word and return it."""
    # Double check it's not in the lexicon already, and if it is,
    # just return it.
    if self._lexicon.has_key(word):
        return self._lexicon[word]

    # Get word id. BBB Backward compat pain.
    inverse = self._inverseLex
    try:
        insert = inverse.insert
    except AttributeError:
        # we have an "old" BTree object
        if inverse:
            wid = inverse.keys()[-1] + 1
        else:
            self._inverseLex = IOBTree()
            wid = 1
        inverse[wid] = word
    else:
        # we have a "new" IOBTree object
        wid = randid()
        while not inverse.insert(wid, word):
            wid = randid()

    self._lexicon[word] = wid

    # Now take all the digrams and insert them into the digram map.
    for digram in self.createDigrams(word):
        set = self._digrams.get(digram, None)
        if set is None:
            self._digrams[digram] = set = IISet()
        set.insert(wid)

    return wid
def query_index(self, record, resultset=None):
    """Search the index with the given IndexQuery object.

    If not `None`, the resultset argument indicates that the search
    result is relevant only on this set, i.e. everything outside
    resultset is of no importance. The index can use this information
    for optimizations.
    """
    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)

    operator = record.operator

    cachekey = None
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record)
        if cachekey is not None:
            cached = None
            if operator == 'or':
                cached = cache.get(cachekey, None)
            else:
                cached_setlist = cache.get(cachekey, None)
                if cached_setlist is not None:
                    r = resultset
                    for s in cached_setlist:
                        # the result is bound by the resultset
                        r = intersection(r, s)
                        # If intersection, we can't possibly get a
                        # smaller result
                        if not r:
                            break
                    cached = r

            if cached is not None:
                if isinstance(cached, int):
                    cached = IISet((cached, ))

                if not_parm:
                    not_parm = list(map(self._convert, not_parm))
                    exclude = self._apply_not(not_parm, resultset)
                    cached = difference(cached, exclude)

                return cached

    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = list(map(self._convert, not_parm))
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = list(map(self._convert, record.keys))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = 'range'
        opr_args = []
        if range_parm.find('min') > -1:
            opr_args.append('min')
        if range_parm.find('max') > -1:
            opr_args.append('max')

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == 'range':  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))

            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]

            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            r = multiunion(tmp)

            if cachekey is not None:
                cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp

            # 'r' is not invariant of resultset. Thus, we
            # have to remember 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            if k is None:
                # Prevent None from being looked up. None doesn't
                # have a valid ordering definition compared to any
                # other object. BTrees 4.0+ will throw a TypeError
                # "object has default comparison".
                continue
            try:
                s = index.get(k, None)
            except TypeError:
                # key is not valid for this BTree so the value is None
                LOG.error(
                    '%(context)s: query_index tried '
                    'to look up key %(key)r from index %(index)r '
                    'but key was of the wrong type.',
                    dict(
                        context=self.__class__.__name__,
                        key=k,
                        index=self.id,
                    )
                )
                s = None
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                if cachekey is not None:
                    # If operator is 'and', we have to cache a list of
                    # IISet objects
                    cache[cachekey] = [IISet()]
                return IISet()
            elif isinstance(s, int):
                s = IISet((s,))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))

            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]

            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'. But
                # this is maybe a performance killer. So we do not cache.
                # if cachekey is not None:
                #     cache[cachekey] = multiunion(setlist)
            else:
                r = multiunion(setlist)
                if cachekey is not None:
                    cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)

            # 'r' is not invariant of resultset. Thus, we
            # have to remember the union of 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet()
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r
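# Standalone sketch (names assumed, not from the original module) of the
# intersection strategy used above: sort candidate sets by size so the
# running intersection shrinks as fast as possible, and bail out as soon
# as it becomes empty.
def _demo_smallest_first_intersection():
    from BTrees.IIBTree import IISet, intersection
    setlist = [IISet(range(100)), IISet([5, 6]), IISet(range(50))]
    r = None  # None acts as the identity element for intersection
    for s in sorted(setlist, key=len):
        r = intersection(r, s)
        if not r:
            break
    assert list(r) == [5, 6]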
def query_index(self, record, resultset=None):
    """Search the index with the given IndexQuery object.

    If the query has a key which matches the 'id' of the index
    instance, one of a few things can happen:

      - if the value is a string, turn the value into a
        single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

      - If the value is a dict and contains a key of the form
        '<index>_operator' this overrides the default method ('or')
        to combine search results. Valid values are 'or' and 'and'.
    """
    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)

    operator = record.operator

    cachekey = None
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record)
        if cachekey is not None:
            cached = None
            if operator == 'or':
                cached = cache.get(cachekey, None)
            else:
                cached_setlist = cache.get(cachekey, None)
                if cached_setlist is not None:
                    r = resultset
                    for s in cached_setlist:
                        # the result is bound by the resultset
                        r = intersection(r, s)
                        # If intersection, we can't possibly get a
                        # smaller result
                        if not r:
                            break
                    cached = r

            if cached is not None:
                if isinstance(cached, int):
                    cached = IISet((cached, ))

                if not_parm:
                    not_parm = list(map(self._convert, not_parm))
                    exclude = self._apply_not(not_parm, resultset)
                    cached = difference(cached, exclude)

                return cached

    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = list(map(self._convert, not_parm))
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = list(map(self._convert, record.keys))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))

            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]

            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            r = multiunion(tmp)

            if cachekey is not None:
                cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp

            # 'r' is not invariant of resultset. Thus, we
            # have to remember 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            if k is None:
                # Prevent None from being looked up. None doesn't
                # have a valid ordering definition compared to any
                # other object. BTrees 4.0+ will throw a TypeError
                # "object has default comparison".
                continue
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                if cachekey is not None:
                    # If operator is 'and', we have to cache a list of
                    # IISet objects
                    cache[cachekey] = [IISet()]
                return IISet()
            elif isinstance(s, int):
                s = IISet((s, ))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))

            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]

            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'. But
                # this is maybe a performance killer. So we do not cache.
                # if cachekey is not None:
                #     cache[cachekey] = multiunion(setlist)
            else:
                r = multiunion(setlist)
                if cachekey is not None:
                    cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)

            # 'r' is not invariant of resultset. Thus, we
            # have to remember the union of 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet()
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r
def test_search_inputresult(self):
    index = self._makeOne()
    obj = Dummy(1, True)
    index._index_object(obj.id, obj, attr='truth')
    obj = Dummy(2, False)
    index._index_object(obj.id, obj, attr='truth')
    obj = Dummy(3, True)
    index._index_object(obj.id, obj, attr='truth')

    # The less common value is indexed
    self.assertEqual(index._index_value, 0)

    res, idx = index._apply_index({'truth': True}, resultset=IISet([]))
    self.assertEqual(idx, ('truth', ))
    self.assertEqual(list(res), [])

    res, idx = index._apply_index({'truth': False}, resultset=IISet([]))
    self.assertEqual(idx, ('truth', ))
    self.assertEqual(list(res), [])

    res, idx = index._apply_index({'truth': True}, resultset=IISet([1]))
    self.assertEqual(list(res), [1])

    res, idx = index._apply_index({'truth': False}, resultset=IISet([1]))
    self.assertEqual(list(res), [])

    res, idx = index._apply_index({'truth': True}, resultset=IISet([2]))
    self.assertEqual(list(res), [])

    res, idx = index._apply_index({'truth': False}, resultset=IISet([2]))
    self.assertEqual(list(res), [2])

    res, idx = index._apply_index({'truth': True},
                                  resultset=IISet([1, 2]))
    self.assertEqual(list(res), [1])

    res, idx = index._apply_index({'truth': False},
                                  resultset=IISet([1, 2]))
    self.assertEqual(list(res), [2])

    res, idx = index._apply_index({'truth': True},
                                  resultset=IISet([1, 3]))
    self.assertEqual(list(res), [1, 3])

    res, idx = index._apply_index({'truth': False},
                                  resultset=IISet([1, 3]))
    self.assertEqual(list(res), [])

    res, idx = index._apply_index({'truth': True},
                                  resultset=IISet([1, 2, 3]))
    self.assertEqual(list(res), [1, 3])

    res, idx = index._apply_index({'truth': False},
                                  resultset=IISet([1, 2, 3]))
    self.assertEqual(list(res), [2])

    res, idx = index._apply_index({'truth': True},
                                  resultset=IISet([1, 2, 99]))
    self.assertEqual(list(res), [1])

    res, idx = index._apply_index({'truth': False},
                                  resultset=IISet([1, 2, 99]))
    self.assertEqual(list(res), [2])
def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the argument.

    Normalize the 'query' arguments into integer values at minute
    precision before querying.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    keys = map(self._convert, record.keys)

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get('range', None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)
        r = multiunion(setlist)
    else:  # not a range search
        for key in keys:
            set = index.get(key, None)
            if set is not None:
                if isinstance(set, int):
                    set = IISet((set, ))
                else:
                    # set can't be bigger than resultset
                    set = intersection(set, resultset)
                r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet(), (self.id, )
    else:
        return r, (self.id, )
def sortResults(self, rs, sort_index, reverse=False, limit=None,
                merge=True, actual_result_count=None, b_start=0,
                b_size=None):
    # Sort a result set using one or more sort indexes. Both sort_index
    # and reverse can be lists of indexes and reverse specifications.
    # Return a lazy result set in sorted order if merge is true,
    # otherwise return a list of (sortkey, uid, getter_function)
    # tuples, where sortkey can be a tuple on its own.
    second_indexes = None
    second_indexes_key_map = None
    sort_index_length = 1
    if isinstance(sort_index, list):
        sort_index_length = len(sort_index)
        if sort_index_length > 1:
            second_indexes = sort_index[1:]
            second_indexes_key_map = []
            for si in second_indexes:
                second_indexes_key_map.append(si.documentToKeyMap())
        sort_index = sort_index[0]

    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    result = []
    r_append = result.append
    r_insert = result.insert
    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # if we want a batch from the end of the result set, reverse sorting
    # order and limit it, then reverse the result set again
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        if isinstance(reverse, list):
            reverse = [not r for r in reverse]
        else:
            reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            overrun = rlen - b_end
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            else:
                b_size += overrun
            b_start = 0
        else:
            b_start = rlen - b_end
        limit = b_start + b_size

    # determine sort_spec
    if isinstance(reverse, list):
        sort_spec = [r and -1 or 1 for r in reverse]
        # limit to current maximum of sort indexes
        sort_spec = sort_spec[:sort_index_length]
        # use first sort order for choosing the algorithm
        first_reverse = reverse[0]
    else:
        sort_spec = []
        for i in xrange(sort_index_length):
            sort_spec.append(reverse and -1 or 1)
        first_reverse = reverse

    if merge and (rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # TODO: len(sort_index) isn't actually what we want for a
        # keyword index, as it's only the unique values, not the
        # documents.
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        if sort_index_length == 1:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    r_append((k, intset, _self__getitem__))
            result.sort(reverse=reverse)
        else:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    # sort on secondary index
                    keysets = defaultdict(list)
                    for i in intset:
                        full_key = (k, )
                        for km in second_indexes_key_map:
                            try:
                                full_key += (km[i], )
                            except KeyError:
                                pass
                        keysets[full_key].append(i)
                    for k2, v2 in keysets.items():
                        r_append((k2, v2, _self__getitem__))
            result = multisort(result, sort_spec)
        sequence, slen = self._limit_sequence(result, length, b_start,
                                              b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen,
                         actual_result_count)
    elif limit is None or (limit * 4 > rlen):
        # Iterate over the result set getting sort keys from the index.
        # If we are interested in at least 25% of the result set,
        # the N-Best algorithm is slower, so we iterate over all.
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index,
                    # skip it.
                    pass
                else:
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while
                    # avoiding the cost of instantiating a LazyMap per
                    # result
                    r_append((key, did, _self__getitem__))
            if merge:
                result.sort(reverse=reverse)
        else:
            for did in rs:
                try:
                    full_key = (index_key_map[did], )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index,
                    # skip it.
                    pass
                else:
                    r_append((full_key, did, _self__getitem__))
            if merge:
                result = multisort(result, sort_spec)
        if merge:
            if limit is not None:
                result = result[:limit]
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif first_reverse:
        # Limit / sort results using N-Best algorithm
        # This is faster for large sets than a full sort
        # and uses far less memory
        keys = []
        k_insert = keys.insert
        n = 0
        worst = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index,
                    # skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index,
                    # skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (full_key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result = multisort(result, sort_spec)
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                           switched_reverse)
        if merge:
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            return sequence
    elif not first_reverse:
        # Limit / sort results using N-Best algorithm in reverse
        # (N-Worst?)
        keys = []
        k_insert = keys.insert
        n = 0
        best = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index,
                    # skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index,
                    # skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    k_insert(i, key)
                    r_insert(i, (full_key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            result = multisort(result, sort_spec)
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                           switched_reverse)
        if merge:
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            return sequence

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
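# Self-contained sketch of the N-Best selection used above: keep only the
# `limit` largest keys seen so far in a sorted buffer, skipping any key
# that cannot beat the current worst. Plain lists stand in for the
# catalog structures; this mirrors the bisect/insert/delete logic of the
# method, not its exact data layout.
def _demo_nbest(items, limit):
    from bisect import bisect
    keys, result = [], []
    n, worst = 0, None
    for key in items:
        if n >= limit and key <= worst:
            continue
        i = bisect(keys, key)
        keys.insert(i, key)
        result.insert(i, key)
        if n == limit:
            del keys[0], result[0]
        else:
            n += 1
        worst = keys[0]
    result.reverse()  # largest first, matching the reverse sort
    return result

# _demo_nbest([3, 1, 4, 1, 5, 9, 2, 6], 3) == [9, 6, 5]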
def checkCatalog(path, indexes):
    """Perform some consistency checks on a ZCatalog instance."""
    root = Zope2.app()

    try:
        catalog = root.unrestrictedTraverse(path)
    except AttributeError:
        print 'Error: catalog object not found'
        sys.exit(1)

    # get Catalog instance
    _cat = catalog._catalog

    # check Catalog internal BTrees
    l_data = list(_cat.data.keys())
    l_data.sort()
    l_uids = list(_cat.uids.values())
    l_uids.sort()
    # paths maps rid -> path, so compare its keys (rids) against data
    l_paths = list(_cat.paths.keys())
    l_paths.sort()

    print "Checking catalog internal BTrees"
    print "\tINFO: Mapping data: %d entries" % len(l_data)
    print "\tINFO: Mapping uids: %d entries" % len(l_uids)
    print "\tINFO: Mapping paths: %d entries" % len(l_paths)

    if l_data == l_uids:
        print "\tOK: Mapping data equals Mapping uids"
    else:
        print "\tERR: Mapping data does not equal Mapping uids"

    if l_data == l_paths:
        print "\tOK: Mapping data equals Mapping paths"
    else:
        print "\tERR: Mapping data does not equal Mapping paths"

    # check BTrees of indexes
    for id, idx in _cat.indexes.items():
        if indexes and idx.meta_type not in indexes:
            continue

        print "Checking index '%s' (type: %s)" % (id, idx.meta_type)

        if idx.meta_type in ['FieldIndex', 'KeywordIndex']:
            # check forward entries
            RIDS = IISet()
            for key, rids in idx._index.items():
                if isinstance(rids, IntType):
                    RIDS.insert(rids)
                else:
                    map(RIDS.insert, rids.keys())

            diff = difference(RIDS, IISet(_cat.data.keys()))
            if len(diff) != 0:
                print '\tERR: Problem with forward entries'
                print '\tERR: too many forward entries:', diff
            else:
                print '\tOK: Forward entries (%d entries)' % (len(RIDS))

        elif idx.meta_type in ['PathIndex']:
            RIDS = IISet()
            for rids in map(None, idx._index.values()):
                map(RIDS.insert, rids.values()[0])

            diff = difference(RIDS, IISet(_cat.data.keys()))
            if len(diff) != 0:
                print '\tERR: Problem with forward entries'
                print '\tERR: too many forward entries:', diff
            else:
                print '\tOK: Forward entries (%d entries)' % (len(RIDS))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex', 'PathIndex']:
            # check backward entries
            RIDS = IISet(idx._unindex.keys())
            diff = difference(RIDS, IISet(_cat.data.keys()))
            if len(diff) != 0:
                print '\tERR: Problem with backward entries'
                print '\tERR: too many backward entries:', diff
            else:
                print '\tOK: Backward entries (%d entries)' % (len(RIDS))
def search(self, path, default_level=0, depth=-1, navtree=0,
           navtree_start=0):
    """
    path is either a string representing a relative URL or a part
    of a relative URL or a tuple (path, level).

    level >= 0  starts searching at the given level
    level <  0  not implemented yet
    """
    if isinstance(path, basestring):
        startlevel = default_level
    else:
        startlevel = int(path[1])
        path = path[0]

    absolute_path = isinstance(path, basestring) and path.startswith('/')
    comps = filter(None, path.split('/'))
    orig_comps = [''] + comps[:]

    if depth > 0:
        raise ValueError("Can't do depth searches anymore")
    if not comps:
        comps = ['dmd']
        startlevel = 1
    else:
        if comps[0] == getCSEConf().get('virtualroot', '').replace('/', ''):
            comps = comps[1:]
        if comps[0] == 'zport':
            comps = comps[1:]
        if comps[0] != 'dmd':
            raise ValueError("Depth searches must start with 'dmd'")
        startlevel = len(comps)

    if len(comps) == 0:
        if depth == -1 and not navtree:
            return IISet(self._unindex.keys())

    # Make sure that we get depth = 1 if in navtree mode
    # unless specified otherwise
    orig_depth = depth
    if depth == -1:
        depth = 0 or navtree

    # Optimized navtree starting with absolute path
    if absolute_path and navtree and depth == 1 and default_level == 0:
        set_list = []
        # Insert root element
        if navtree_start >= len(orig_comps):
            navtree_start = 0
        # create a set of parent paths to search
        for i in range(len(orig_comps), navtree_start, -1):
            parent_path = '/'.join(orig_comps[:i])
            parent_path = parent_path and parent_path or '/'
            try:
                set_list.append(self._index_parents[parent_path])
            except KeyError:
                pass
        return multiunion(set_list)
    # Optimized breadcrumbs
    elif absolute_path and navtree and depth == 0 and default_level == 0:
        item_list = IISet()
        # Insert root element
        if navtree_start >= len(orig_comps):
            navtree_start = 0
        # create a set of parent paths to search
        for i in range(len(orig_comps), navtree_start, -1):
            parent_path = '/'.join(orig_comps[:i])
            parent_path = parent_path and parent_path or '/'
            try:
                item_list.insert(self._index_items[parent_path])
            except KeyError:
                pass
        return item_list
    # Specific object search
    elif absolute_path and orig_depth == 0 and default_level == 0:
        try:
            return IISet([self._index_items[path]])
        except KeyError:
            return IISet()
    # Single depth search
    elif absolute_path and orig_depth == 1 and default_level == 0:
        # only get objects contained in requested folder
        try:
            return self._index_parents[path]
        except KeyError:
            return IISet()
    # Sitemaps, relative paths, and depth queries
    elif startlevel >= 0:
        pathset = None  # Same as pathindex
        navset = None  # For collecting siblings along the way
        depthset = None  # For limiting depth

        if navtree and depth and \
                self._index.has_key(None) and \
                self._index[None].has_key(startlevel):
            navset = self._index[None][startlevel]

        for level in range(startlevel, startlevel + len(comps)):
            if level <= len(comps):
                comp = "/".join(comps[:level])
                if (not self._index.has_key(comp)
                        or not self._index[comp].has_key(level)):
                    # Navtree is inverse, keep going even for
                    # nonexisting paths
                    if navtree:
                        pathset = IISet()
                    else:
                        return IISet()
                else:
                    return self._index[comp][level]
                if navtree and depth and \
                        self._index.has_key(None) and \
                        self._index[None].has_key(level + depth):
                    navset = union(
                        navset,
                        intersection(pathset,
                                     self._index[None][level + depth]))
            if level - startlevel >= len(comps) or navtree:
                if (self._index.has_key(None)
                        and self._index[None].has_key(level)):
                    depthset = union(
                        depthset,
                        intersection(pathset,
                                     self._index[None][level]))

        if navtree:
            return union(depthset, navset) or IISet()
        elif depth:
            return depthset or IISet()
        else:
            return pathset or IISet()
    else:
        results = IISet()
        for level in range(0, self._depth + 1):
            ids = None
            error = 0
            for cn in range(0, len(comps)):
                comp = comps[cn]
                try:
                    ids = intersection(ids, self._index[comp][level + cn])
                except KeyError:
                    error = 1
            if error == 0:
                results = union(results, ids)
        return results
def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the request arg.

    The request argument should be a mapping object.

    If the request does not have a key which matches the "id" of
    the index instance, then None is returned.

    If the request *does* have a key which matches the "id" of
    the index instance, one of a few things can happen:

      - if the value is a blank string, None is returned (in order
        to support requests from web forms where you can't tell a
        blank string from empty).

      - if the value is a nonblank string, turn the value into
        a single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

      - If the value is a dict and contains a key of the form
        '<index>_operator' this overrides the default method ('or')
        to combine search results. Valid values are "or" and "and".

    If None is not returned as a result of the abovementioned
    constraints, two objects are returned. The first object is a
    ResultSet containing the record numbers of the matching
    records. The second object is a tuple containing the names of
    all data fields used.

    FAQ answer: to search a Field Index for documents that
    have a blank string as their value, wrap the request value
    up in a tuple ala: request = {'id':('',)}
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)
    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = map(self._convert, not_parm)
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = map(self._convert, record.keys)

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % escape(operator))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result, (self.id, )

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            r = multiunion(tmp)
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp
            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                return IISet(), (self.id, )
            elif isinstance(s, int):
                s = IISet((s, ))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result, (self.id, )

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)
            else:
                r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)
            r = resultset
            for s in setlist:
                r = intersection(r, s)

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet(), (self.id, )
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r, (self.id, )
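# Hedged usage sketch (index id 'foo' assumed, not from the original
# code): the request shapes the docstring above describes, in the
# standard ZCatalog mapping form. The index instance is passed in since
# none exists in this sketch.
def _demo_apply_index_requests(index):
    index._apply_index({'foo': 'bar'})  # single key
    index._apply_index({'foo': ('bar', 'baz')})  # union of two keys
    index._apply_index({'foo': {'query': ('bar', 'baz'),
                                'operator': 'and'}})  # intersection
    index._apply_index({'foo': {'query': ('a', 'z'),
                                'range': 'min:max'}})  # range search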
def count(self, brains, sequence=None):
    """ Intersect results
    """
    res = {}
    # by checking for facet_counts we assume this is a SolrResponse
    # from collective.solr
    if hasattr(brains, 'facet_counts'):
        facet_fields = brains.facet_counts.get('facet_fields')
        if facet_fields:
            index_id = self.data.get('index')
            facet_field = facet_fields.get(index_id, {})
            for value, num in facet_field.items():
                normalized_value = atdx_normalize(value)
                if isinstance(value, unicode):
                    res[value] = num
                elif isinstance(normalized_value, unicode):
                    res[normalized_value] = num
                else:
                    unicode_value = value.decode('utf-8')
                    res[unicode_value] = num
        else:
            # no facet counts were returned. we exit anyway because
            # zcatalog methods throw an error on solr responses
            return res
        res[""] = res['all'] = len(brains)
        return res
    else:
        # this is handled by the zcatalog. see below
        pass

    if not sequence:
        sequence = [key for key, value in self.vocabulary()]

    if not sequence:
        return res

    index_id = self.data.get('index')
    if not index_id:
        return res

    ctool = getToolByName(self.context, 'portal_catalog')
    index = ctool._catalog.getIndex(index_id)
    ctool = queryUtility(IFacetedCatalog)
    if not ctool:
        return res

    if isinstance(brains, LazyMap):
        values = brains._seq
        # 75384 seq might be a pair of tuples instead of ints
        # if you upgrade to ZCatalog 3
        if isinstance(values[0], tuple):
            values = [v[1] for v in values]
        brains = IISet(values)
    else:
        brains = IISet(brain.getRID() for brain in brains)

    res[""] = res['all'] = len(brains)
    for value in sequence:
        if not value:
            res[value] = len(brains)
            continue
        normalized_value = atdx_normalize(value)
        rset = ctool.apply_index(self.context, index, normalized_value)[0]
        rset = IISet(rset)
        rset = weightedIntersection(brains, rset)[1]
        if isinstance(value, unicode):
            res[value] = len(rset)
        elif isinstance(normalized_value, unicode):
            res[normalized_value] = len(rset)
        else:
            unicode_value = value.decode('utf-8')
            res[unicode_value] = len(rset)
    return res
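# Minimal illustration (separate from the original code) of the BTrees
# call used above: weightedIntersection() returns a (weight, set) pair,
# which is why the result is taken from index [1].
def _demo_weighted_intersection():
    from BTrees.IIBTree import IISet, weightedIntersection
    weight, rset = weightedIntersection(IISet([1, 2, 3]), IISet([2, 3, 4]))
    assert list(rset) == [2, 3]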
def _makeOne(self):
    from BTrees.IIBTree import IISet
    return IISet()
def dateindex_apply_index(self, request, cid='', type=type, res=None):
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    keys = map(self._convert, record.keys)

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get('range', None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # for k, set in setlist:
        #     if type(set) is IntType:
        #         set = IISet((set,))
        #     r = set_func(r, set)
        # XXX: Use multiunion!
        r = multiunion(setlist)
    else:  # not a range search
        for key in keys:
            set = index.get(key, None)
            if set is not None:
                if isinstance(set, int):
                    set = IISet((set,))
                else:
                    # set can't be bigger than res
                    set = intersection(set, res)
                r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
def _apply_index(self, request, cid='', type=type):
    """Apply the index to query parameters given in the request arg.

    The request argument should be a mapping object.

    If the request does not have a key which matches the "id" of
    the index instance, then None is returned.

    If the request *does* have a key which matches the "id" of
    the index instance, one of a few things can happen:

      - if the value is a blank string, None is returned (in order
        to support requests from web forms where you can't tell a
        blank string from empty).

      - if the value is a nonblank string, turn the value into
        a single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

    If the request contains a parameter with the name of the column
    + '_usage', it is sniffed for information on how to handle
    applying the index.

    If the request contains a parameter with the name of the column
    + '_operator', this overrides the default method ('or') to
    combine search results. Valid values are "or" and "and".

    If None is not returned as a result of the abovementioned
    constraints, two objects are returned. The first object is a
    ResultSet containing the record numbers of the matching
    records. The second object is a tuple containing the names of
    all data fields used.

    FAQ answer: to search a Field Index for documents that
    have a blank string as their value, wrap the request value
    up in a tuple ala: request = {'id':('',)}
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % escape(operator))

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.items(lo, hi)
        else:
            setlist = index.items(lo)
        for k, set in setlist:
            if isinstance(set, int):
                set = IISet((set,))
            r = set_func(r, set)
    else:  # not a range search
        for key in record.keys:
            set = index.get(key, None)
            if set is None:
                set = IISet(())
            elif isinstance(set, int):
                set = IISet((set,))
            r = set_func(r, set)

    if isinstance(r, int):
        r = IISet((r,))
    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
def _loadAux(self, index, term):
    '''Return the docId list for *term*.'''
    dl = index.get(term)
    if dl is None:
        return IISet()
    if isinstance(dl, int):
        return IISet((dl, ))
    return dl
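# Illustrative sketch (not from the original module) of the convention
# _loadAux() normalizes: index buckets may store a bare integer docId
# instead of a set when there is a single entry, so callers must handle
# both shapes plus the missing-term case.
def _demo_load_aux_shapes():
    from BTrees.IIBTree import IISet
    for stored in (None, 7, IISet([1, 2])):
        if stored is None:
            dl = IISet()
        elif isinstance(stored, int):
            dl = IISet((stored, ))
        else:
            dl = stored
        assert isinstance(dl, IISet)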
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1,
                actual_result_count=None, b_start=0, b_size=None):
    # Sort a result set using a sort index. Return a lazy
    # result set in sorted order if merge is true, otherwise
    # return a list of (sortkey, uid, getter_function) tuples.
    #
    # The two 'for' loops in here contribute a significant
    # proportion of the time to perform an indexed search.
    # Try to avoid all non-local attribute lookup inside
    # those loops.
    _intersection = intersection
    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    _None = None
    _keyerror = KeyError
    result = []
    append = result.append
    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # if we want a batch from the end of the resultset, reverse sorting
    # order and limit it, then reverse the resultset again
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            overrun = rlen - b_end
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            else:
                b_size += overrun
            b_start = 0
        else:
            b_start = rlen - b_end
        limit = b_start + b_size

    if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # This is rarely exercised in practice...
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        for k, intset in sort_index.items():
            # We have an index that has a set of values for
            # each sort key, so we intersect with each set and
            # get a sorted sequence of the intersections.
            intset = _intersection(rs, intset)
            if intset:
                keys = getattr(intset, 'keys', _None)
                if keys is not _None:
                    # Is this ever true?
                    intset = keys()
                length += len(intset)
                append((k, intset, _self__getitem__))
                # Note that sort keys are unique.

        if reverse:
            result.sort(reverse=True)
        else:
            result.sort()
        sequence, slen = self._limit_sequence(result, length, b_start,
                                              b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen,
                         actual_result_count)
    elif limit is None or (limit * 4 > rlen):
        # Iterate over the result set getting sort keys from the index
        for did in rs:
            try:
                key = index_key_map[did]
            except _keyerror:
                # This document is not in the sort key index, skip it.
                pass
            else:
                append((key, did, _self__getitem__))
                # The reference back to __getitem__ is used in case
                # we do not merge now and need to intermingle the
                # results with those of other catalogs while avoiding
                # the cost of instantiating a LazyMap per result
        if merge:
            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            if limit is not None:
                result = result[:limit]
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif reverse:
        # Limit/sort results using N-Best algorithm
        # This is faster for large sets than a full sort
        # and uses far less memory
        keys = []
        n = 0
        worst = None
        for did in rs:
            try:
                key = index_key_map[did]
            except _keyerror:
                # This document is not in the sort key index, skip it.
                pass
            else:
                if n >= limit and key <= worst:
                    continue
                i = bisect(keys, key)
                keys.insert(i, key)
                result.insert(i, (key, did, _self__getitem__))
                if n == limit:
                    del keys[0], result[0]
                else:
                    n += 1
                worst = keys[0]
        result.reverse()
        if merge:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence
    elif not reverse:
        # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
        keys = []
        n = 0
        best = None
        for did in rs:
            try:
                key = index_key_map[did]
            except _keyerror:
                # This document is not in the sort key index, skip it.
                pass
            else:
                if n >= limit and key >= best:
                    continue
                i = bisect(keys, key)
                keys.insert(i, key)
                result.insert(i, (key, did, _self__getitem__))
                if n == limit:
                    del keys[-1], result[-1]
                else:
                    n += 1
                best = keys[-1]
        if merge:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count
        else:
            sequence, _ = self._limit_sequence(result, 0, b_start,
                                               b_size, switched_reverse)
            return sequence

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
def index_object(self, documentId, obj, threshold=None):
    """index an object, normalizing the indexed value to an integer

    o Normalized value has granularity of one minute.

    o Objects which have 'None' as indexed value are *omitted*,
      by design.

    o Repeat by recurdef - a RFC2445 recurrence definition string
    """
    returnStatus = 0

    try:
        date_attr = getattr(obj, self.id)
        if safe_callable(date_attr):
            date_attr = date_attr()
    except AttributeError:
        return returnStatus

    recurdef = getattr(obj, self.attr_recurdef, None)
    if safe_callable(recurdef):
        recurdef = recurdef()

    if not recurdef:
        dates = [pydt(date_attr)]
    else:
        until = getattr(obj, self.attr_until, None)
        if safe_callable(until):
            until = until()

        dates = recurrence_sequence_ical(date_attr, recrule=recurdef,
                                         until=until)

    newvalues = IISet(map(dt2int, dates))
    oldvalues = self._unindex.get(documentId, _marker)
    if oldvalues is not _marker:
        oldvalues = IISet(oldvalues)

    if oldvalues is not _marker and newvalues is not _marker \
            and not difference(newvalues, oldvalues) \
            and not difference(oldvalues, newvalues):
        # difference is calculated relative to the first argument,
        # so we have to use it twice here
        return returnStatus

    if oldvalues is not _marker:
        for oldvalue in oldvalues:
            self.removeForwardIndexEntry(oldvalue, documentId)
        if newvalues is _marker:
            try:
                del self._unindex[documentId]
            except ConflictError:
                raise
            except:
                LOG.error("Should not happen: oldvalues was there,"
                          " now it's not, for document with id %s" %
                          documentId)

    if newvalues is not _marker:
        inserted = False
        for value in newvalues:
            self.insertForwardIndexEntry(value, documentId)
            inserted = True
        if inserted:
            # store tuple values in reverse index entries for sorting
            self._unindex[documentId] = tuple(newvalues)
            returnStatus = 1

    return returnStatus
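# Small illustration (separate from the index code) of why difference()
# is applied in both directions above: BTrees difference(a, b) returns
# the elements of a not in b, so proving two sets equal requires both
# directions to come back empty.
def _demo_difference_both_ways():
    from BTrees.IIBTree import IISet, difference
    a, b = IISet([1, 2]), IISet([2, 3])
    assert list(difference(a, b)) == [1]
    assert list(difference(b, a)) == [3]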
def extendedpathindex_search(self, path, default_level=0, depth=-1,
                             navtree=0, navtree_start=0, tmpres=None):
    """
    path is either a string representing a relative URL or a part of a
    relative URL, or a tuple (path, level).

    default_level specifies the level to use when no more specific
    level has been passed in with the path.

    level >= 0 starts searching at the given level
    level < 0 finds matches at *any* level

    depth lets you limit the results to items at most depth levels
    deeper than the matched path. depth == 0 means no subitems are
    included at all, with depth == 1 only direct children are included,
    etc. depth == -1, the default, returns all children at any depth.

    navtree is treated as a boolean; if it evaluates to True, not only
    the query match is returned, but also each container in the path.
    If depth is greater than 0, all siblings of those containers and of
    the match are included too, plus *all* documents at the starting
    level.

    navtree_start limits what containers are included in a navtree
    search. If greater than 0, only containers (and possibly their
    siblings) at that level and up will be included in the resultset.
    """
    if isinstance(path, basestring):
        level = default_level
    else:
        level = int(path[1])
        path = path[0]

    if level < 0:
        # Search at every level, return the union of all results
        return multiunion(
            [self.search(path, level, depth, navtree, navtree_start)
             for level in xrange(self._depth + 1)])

    comps = filter(None, path.split('/'))

    if navtree and depth == -1:  # Navtrees don't do recursive
        depth = 1

    #
    # Optimisations
    #

    pathlength = level + len(comps) - 1
    if navtree and navtree_start > min(pathlength + depth, self._depth):
        # This navtree_start excludes all items that match the depth
        return IISet()
    if pathlength > self._depth:
        # Our search is for a path longer than anything in the index
        return IISet()

    if level == 0 and depth in (0, 1):
        # We have easy indexes for absolute paths where
        # we are looking for depth 0 or 1 result sets
        if navtree:
            # Optimized absolute path navtree and breadcrumbs cases
            result = []
            add = lambda x: x is not None and result.append(x)
            if depth == 1:
                # Navtree case, all sibling elements along the path
                convert = multiunion
                index = self._index_parents
            else:
                # Breadcrumbs case, all direct elements along the path
                convert = IISet
                index = self._index_items
            # Collect all results along the path
            for i in range(len(comps), navtree_start - 1, -1):
                parent_path = '/' + '/'.join(comps[:i])
                add(index.get(parent_path))
            return convert(result)

        if not path.startswith('/'):
            path = '/' + path
        if depth == 0:
            # Specific object search
            res = self._index_items.get(path)
            return res and IISet([res]) or IISet()
        else:
            # Single depth search
            return self._index_parents.get(path, IISet())

    # Avoid using the root set as it is common for all objects anyway
    # and only adds overhead. There is an assumption about all indexed
    # values having the same common base path.
    if level == 0:
        indexpath = list(filter(None, self.getPhysicalPath()))
        minlength = min(len(indexpath), len(comps))
        # Truncate path to the first differing element
        for i in xrange(minlength):
            if indexpath[i] != comps[i]:
                break
            level += 1
        comps = comps[level:]

    if not comps and depth == -1:
        # Recursive search for everything
        return IISet(self._unindex)

    #
    # Core application of the indexes
    #

    pathset = tmpres  # Same as pathindex
    depthset = None  # For limiting depth

    if navtree and depth > 0:
        # Include the elements up to the matching path
        depthset = multiunion([
            self._index.get(None, {}).get(i, IISet())
            for i in range(min(navtree_start, level),
                           max(navtree_start, level) + 1)])

    indexedcomps = enumerate(comps)
    if not navtree:
        # Optimize relative-path searches by starting with the
        # presumed smaller sets at the end of the path first. We can't
        # do this for the navtree case because it needs the bigger
        # rootset to include siblings along the way.
        indexedcomps = list(indexedcomps)
        indexedcomps.reverse()

    for i, comp in indexedcomps:
        # Find all paths that have comp at the given level
        res = self._index.get(comp, {}).get(i + level)
        if res is None:
            # Non-existing path; navtree is inverse, keep going
            pathset = IISet()
            if not navtree:
                return pathset
        pathset = intersection(pathset, res)

        if navtree and i + level >= navtree_start:
            depthset = union(depthset, intersection(
                pathset, self._index.get(None, {}).get(i + level)))

    if depth >= 0:
        # Limit results to those that terminate within depth levels
        start = len(comps) - 1
        if navtree:
            start = max(start, navtree_start - level)
        depthset = multiunion(filter(None, [depthset] + [
            intersection(pathset,
                         self._index.get(None, {}).get(i + level))
            for i in xrange(start, start + depth + 1)]))

    if navtree or depth >= 0:
        return depthset
    return pathset
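# Hedged usage sketch: assuming an already-populated ExtendedPathIndex
# instance (the `index` argument and the /plone/news paths are made
# up), these calls exercise the parameter combinations documented
# above. The method is invoked as index.search(), which is what the
# code itself calls recursively.
from BTrees.IIBTree import multiunion

def demo_path_queries(index):
    everything = index.search('/plone/news')            # any depth
    the_object = index.search('/plone/news', depth=0)   # exact object
    children = index.search('/plone/news', depth=1)     # direct children
    at_level = index.search(('news', 1))                # 'news' at level 1
    any_level = index.search('news', default_level=-1)  # 'news' anywhere
    navtree = index.search('/plone/news', navtree=1)    # path + siblings
    return multiunion([everything, the_object, children,
                       at_level, any_level, navtree])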
def __call__(self, context, **query):
    # return super(FacetedCatalog, self).__call__(context, **query)
    ctool = getToolByName(context, 'portal_faceted', None)
    if ctool:
        search = ctool.search
    else:
        logger.debug('portal_faceted not present, using portal_catalog')
        ctool = getToolByName(context, 'portal_catalog')
        search = ctool.searchResults

    # Also get query from Topic
    buildQuery = getattr(context, 'buildQuery', None)
    newquery = buildQuery and buildQuery() or {}
    formquery = None

    # Get query from Collection
    if HAS_PAT:
        if PACI.ICollection.providedBy(context):
            infos = ICollection_behavior(context)
            sort_order = ('descending' if infos.sort_reversed
                          else 'ascending')
            sort_on = infos.sort_on
            formquery = infos.query

    if ICollection.providedBy(context):
        getRawQuery = getattr(context, 'getRawQuery', lambda: [])
        formquery = getRawQuery()
        getSortOn = getattr(context, 'getSort_on', lambda: None)
        sort_on = getSortOn()
        if sort_on:
            getSortReversed = getattr(context, 'getSort_reversed',
                                      lambda: None)
            sort_order = getSortReversed()
            if sort_order:
                sort_order = 'descending'
            else:
                sort_order = 'ascending'
        else:
            sort_order = None

    if formquery is not None:
        newquery = parseFormquery(context, formquery, sort_on, sort_order)

    if not isinstance(newquery, dict):
        newquery = {}

    # Avoid mixing sorting params from faceted and collection
    if 'sort_on' not in query:
        query.pop('sort_order', None)
    if 'sort_on' in query and 'sort_order' not in query:
        newquery.pop('sort_order', None)

    newquery.update(query)
    notify(QueryWillBeExecutedEvent(context, newquery))
    # code above is unchanged from original code

    # example of query:
    # {'sort_order': 'descending',
    #  'Language': ['fr', ''],
    #  'portal_type': {'query': ['operationalobjective']},
    #  'sort_on': 'created',
    #  ':has_child': {'query': {
    #      'portal_type': {'query': ['pstaction']},
    #      'planned_end_date': {
    #          'query': DateTime('2016/08/12 12:49:6.218718 GMT+2'),
    #          'range': 'max'}}}}
    has_child_filters = [k for k in newquery.keys()
                         if k.startswith(':has_child')]
    if len(has_child_filters) > 1:
        raise Exception('We only support one :has_child filter')

    if has_child_filters:
        sort_on = newquery.pop('sort_on', None)
        sort_order = newquery.pop('sort_order', None)
        limit = newquery.pop('limit', None)
        b_start = int(newquery.pop('b_start', 0))
        b_size = newquery.pop('b_size', None)
        if b_size is not None:
            b_size = int(b_size)
            limit = b_start + b_size
        elif limit:
            b_size = limit

        has_child_filter = deepcopy(
            newquery[has_child_filters[0]]['query'])
        # make sure we don't do a useless sort and that we get all results
        has_child_filter.pop('sort_on', None)
        has_child_filter.pop('sort_order', None)
        has_child_filter.pop('limit', None)
        has_child_filter.pop('b_size', None)
        children_results = search(**has_child_filter)
        parentRIDs = IISet()
        for b in children_results:
            # TODO: possible optimization is to add parentRID as brain
            # metadata
            parentRID = ctool._catalog.uids.get(
                '/'.join(b.getPath().split('/')[:-1]))
            parentRIDs.add(parentRID)

        brains = search(**newquery)
        # brains is a LazyMap; use brains._seq directly (a list of rids)
        # instead of building IISet(brain.getRID() for brain in brains)
        rs = weightedIntersection(IISet(brains._seq), parentRIDs)[1]
        rlen = len(rs)
        if sort_on is not None:
            # We only support a single sort order
            sort_index = ctool._catalog.indexes[sort_on]
            reverse = 1 if sort_order == 'descending' else 0
            brains = ctool._catalog.sortResults(
                rs, sort_index, reverse, limit, merge=1,
                actual_result_count=rlen, b_start=b_start,
                b_size=b_size)
        else:
            brains = brains.__class__(brains._func, rs, rlen, rlen)
    else:
        brains = search(**newquery)

    return brains
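# Minimal sketch of the parent-rid intersection performed above, using
# plain IISets in place of catalog brains (the rids are made up):
from BTrees.IIBTree import IISet, weightedIntersection

child_parent_rids = IISet([11, 42, 97])  # parents of matching children
query_rids = IISet([7, 42, 97, 150])     # rids matched by the main query
# weightedIntersection returns (weight, result); only the set is needed
matching = weightedIntersection(query_rids, child_parent_rids)[1]
assert list(matching) == [42, 97]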
def test_sortResults_reversed(self):
    brains = self._catalog({'att1': 'att1'})
    rs = IISet([b.getRID() for b in brains])
    si = self._catalog.getIndex('num')
    result = self._catalog.sortResults(rs, si, reverse=True)
    self.assertEqual([r.num for r in result], list(reversed(range(100))))
def search(self, path, default_level=0, depth=-1, navtree=0,
           navtree_start=0):
    """
    path is either a string representing a relative URL or a part of a
    relative URL, or a tuple (path, level).

    level >= 0 starts searching at the given level
    level < 0 is not implemented yet
    """
    if isinstance(path, basestring):
        startlevel = default_level
    else:
        startlevel = int(path[1])
        path = path[0]

    absolute_path = isinstance(path, basestring) and path.startswith('/')
    comps = filter(None, path.split('/'))
    orig_comps = [''] + comps[:]

    # Optimization - avoid using the root set as it is common for all
    # objects anyway and only adds overhead. There is an assumption
    # about catalog/index having the same container as content.
    if default_level == 0:
        indexpath = list(filter(None, self.getPhysicalPath()))
        while min(len(indexpath), len(comps)):
            if indexpath[0] == comps[0]:
                del indexpath[0]
                del comps[0]
                startlevel += 1
            else:
                break

    if len(comps) == 0:
        if depth == -1 and not navtree:
            return IISet(self._unindex.keys())

    # Make sure that we get depth = 1 if in navtree mode
    # unless specified otherwise
    orig_depth = depth
    if depth == -1:
        depth = 0 or navtree

    # Optimized navtree starting with absolute path
    if absolute_path and navtree and depth == 1 and default_level == 0:
        set_list = []
        # Insert root element
        if navtree_start >= len(orig_comps):
            navtree_start = 0
        # create a set of parent paths to search
        for i in range(len(orig_comps), navtree_start, -1):
            parent_path = '/'.join(orig_comps[:i])
            parent_path = parent_path and parent_path or '/'
            try:
                set_list.append(self._index_parents[parent_path])
            except KeyError:
                pass
        return multiunion(set_list)
    # Optimized breadcrumbs
    elif absolute_path and navtree and depth == 0 and default_level == 0:
        item_list = IISet()
        # Insert root element
        if navtree_start >= len(orig_comps):
            navtree_start = 0
        # create a set of parent paths to search
        for i in range(len(orig_comps), navtree_start, -1):
            parent_path = '/'.join(orig_comps[:i])
            parent_path = parent_path and parent_path or '/'
            try:
                item_list.insert(self._index_items[parent_path])
            except KeyError:
                pass
        return item_list
    # Specific object search
    elif absolute_path and orig_depth == 0 and default_level == 0:
        try:
            return IISet([self._index_items[path]])
        except KeyError:
            return IISet()
    # Single depth search
    elif absolute_path and orig_depth == 1 and default_level == 0:
        # only get objects contained in requested folder
        try:
            return self._index_parents[path]
        except KeyError:
            return IISet()
    # Sitemaps, relative paths, and depth queries
    elif startlevel >= 0:
        pathset = None  # Same as pathindex
        navset = None  # For collecting siblings along the way
        depthset = None  # For limiting depth

        if navtree and depth and \
                self._index.has_key(None) and \
                self._index[None].has_key(startlevel):
            navset = self._index[None][startlevel]

        for level in range(startlevel, startlevel + len(comps) + depth):
            if level - startlevel < len(comps):
                comp = comps[level - startlevel]
                if not self._index.has_key(comp) or \
                        not self._index[comp].has_key(level):
                    # Navtree is inverse, keep going even for
                    # nonexisting paths
                    if navtree:
                        pathset = IISet()
                    else:
                        return IISet()
                else:
                    pathset = intersection(pathset,
                                           self._index[comp][level])
                if navtree and depth and \
                        self._index.has_key(None) and \
                        self._index[None].has_key(level + depth):
                    navset = union(navset, intersection(
                        pathset, self._index[None][level + depth]))
            if level - startlevel >= len(comps) or navtree:
                if self._index.has_key(None) and \
                        self._index[None].has_key(level):
                    depthset = union(depthset, intersection(
                        pathset, self._index[None][level]))

        if navtree:
            return union(depthset, navset) or IISet()
        elif depth:
            return depthset or IISet()
        else:
            return pathset or IISet()
    else:
        # Fallback: brute-force check every level of the index
        results = IISet()
        for level in range(0, self._depth + 1):
            ids = None
            error = 0
            for cn in range(0, len(comps)):
                comp = comps[cn]
                try:
                    ids = intersection(ids, self._index[comp][level + cn])
                except KeyError:
                    error = 1
            if error == 0:
                results = union(results, ids)
        return results
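# Hedged sketch of the brute-force fallback above: for each level the
# path components must match at consecutive depths, and per-level hits
# are unioned. The toy dict stands in for self._index (whose structure
# is component -> level -> IISet); the ids are made up, and the sketch
# returns early where the original sets an error flag.
from BTrees.IIBTree import IISet, intersection, union

toy_index = {
    'plone': {0: IISet([1, 2, 3])},
    'news': {1: IISet([2, 3]), 2: IISet([9])},
}

def match_at_level(comps, level):
    ids = None
    for cn, comp in enumerate(comps):
        try:
            # intersection(None, s) returns s, so the first
            # component seeds ids
            ids = intersection(ids, toy_index[comp][level + cn])
        except KeyError:
            return None
    return ids

results = IISet()
for level in range(0, 3):
    ids = match_at_level(['news'], level)
    if ids is not None:
        results = union(results, ids)
assert list(results) == [2, 3, 9]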