def _reindex_doc(self, docid, text):
    # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
    old_wids = self.get_words(docid)
    old_wid2w, old_docw = self._get_frequencies(old_wids)

    new_wids = self._lexicon.sourceToWordIds(text)
    new_wid2w, new_docw = self._get_frequencies(new_wids)

    old_widset = IITreeSet(old_wid2w.keys())
    new_widset = IITreeSet(new_wid2w.keys())

    in_both_widset = intersection(old_widset, new_widset)
    only_old_widset = difference(old_widset, in_both_widset)
    only_new_widset = difference(new_widset, in_both_widset)
    del old_widset, new_widset

    for wid in only_old_widset.keys():
        self._del_wordinfo(wid, docid)

    for wid in only_new_widset.keys():
        self._add_wordinfo(wid, new_wid2w[wid], docid)

    for wid in in_both_widset.keys():
        # For the Okapi indexer, the "if" will trigger only for words
        # whose counts have changed.  For the cosine indexer, the "if"
        # may trigger for every wid, since W(d) probably changed and
        # W(d) is divided into every score.
        newscore = new_wid2w[wid]
        if old_wid2w[wid] != newscore:
            self._add_wordinfo(wid, newscore, docid)

    self._docweight[docid] = new_docw
    self._docwords[docid] = WidCode.encode(new_wids)
    return len(new_wids)

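# A minimal standalone sketch (not part of the indexer) of the three-way
# partition used by _reindex_doc above: old and new word ids are split into
# delete-only, add-only, and possibly-rescored groups, so only changed
# postings are touched. The literal wid values here are invented.
from BTrees.IIBTree import IITreeSet, intersection, difference

old_wids = IITreeSet([1, 2, 3])
new_wids = IITreeSet([2, 3, 4])

in_both = intersection(old_wids, new_wids)  # scores may need updating
only_old = difference(old_wids, in_both)    # postings to delete
only_new = difference(new_wids, in_both)    # postings to add

print(list(only_old))  # [1]
print(list(in_both))   # [2, 3]
print(list(only_new))  # [4]
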
def index_object(self, documentId, obj, threshold=None):
    """ wrapper to handle indexing of multiple attributes """

    # needed for backward compatibility
    try:
        fields = self._indexed_fields
    except:
        fields = [self.id]

    res = 0
    all_wids = []
    for attr in fields:
        try:
            wids = self._index_object(documentId, obj, threshold, attr)
            if wids is not None:
                all_wids.extend(wids)
        except:
            pass

    # get rid of words removed by reindexing
    try:
        o_wids = IISet(self._storage.getWordIdsForDocId(documentId))
    except KeyError:
        o_wids = IISet()

    all_wids_set = IISet(all_wids)
    remove_wids = difference(o_wids, all_wids_set)
    insert_wids = difference(all_wids_set, o_wids)

    insert_dict = {}  # hash wids to dict for performance reasons
    for wid in insert_wids.keys():
        insert_dict[wid] = 1

    if len(remove_wids) > 0:
        self._storage.removeWordIdsForDocId(documentId, remove_wids)
    if all_wids:
        self._storage.insert(
            [w for w in all_wids if insert_dict.has_key(w)], documentId)

    return len(all_wids)

def _update(self, documentId, val, oldval, threshold):
    val = IITreeSet(val)
    oldval = IITreeSet(self._unindexVal2Val(oldval))
    add = difference(val, oldval)
    rem = difference(oldval, val)
    if add:
        self._indexValue(documentId, add, threshold)
    if rem:
        self._unindexValue(documentId, rem)
    return len(add)

def _update(self, documentId, val, oldval, threshold):
    add = difference(val, oldval)
    rem = difference(oldval, val)
    if add:
        self._indexValue(documentId, add, threshold)
    if rem:
        self._unindexValue(documentId, rem)
    # optimize transaction size by not writing _unindex bucket
    if len(rem) < 100:
        for x in rem:
            oldval.remove(x)  # sad that we do not have a mass remove
        oldval.update(add)
    else:
        oldval.clear()
        oldval.update(val)
    return len(add),

def query_index(self, record, resultset=None):
    index = self._index
    indexed = self._index_value

    for key in record.keys:
        if bool(key) is bool(indexed):
            # If we match the indexed value, check index
            return intersection(index, resultset)
        else:
            # Otherwise, remove from resultset or _unindex
            if resultset is None:
                return union(difference(self._unindex, index), IISet([]))
            else:
                return difference(resultset, index)
    return IISet()

def query_index(self, record, resultset=None):
    level = record.get("level", 0)
    operator = record.get('operator', self.useOperator).lower()
    depth = getattr(record, 'depth', -1)  # use getattr to get 0 value
    navtree = record.get('navtree', 0)
    navtree_start = record.get('navtree_start', 0)
    exclude_root = record.get('exclude_root', 0)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    result = None
    for k in record.keys:
        rows = self.search(k, level, depth, navtree, navtree_start,
                           resultset=resultset)
        if exclude_root:
            root = self._index_items.get(k)
            rows = difference(rows, root)
        result = set_func(result, rows)

    if result:
        return result
    return IISet()

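# Hedged illustration of the set_func choice above: with the BTrees set
# operations, "or" accumulates rows via union while "and" narrows them via
# intersection, and both treat a None argument (the initial result) as "no
# constraint yet" by returning the other operand. Toy IISets only; these are
# not the index's real row data.
from BTrees.IIBTree import IISet, union, intersection

rows_a = IISet([1, 2])
rows_b = IISet([2, 3])

print(list(union(rows_a, rows_b)))         # [1, 2, 3]  ("or")
print(list(intersection(rows_a, rows_b)))  # [2]        ("and")
print(list(union(None, rows_b)))           # [2, 3]     (first iteration)
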
def missing_entries_for_index(self, catalog, index_name):
    """ Return the difference between catalog and index ids """
    index = catalog._catalog.getIndex(index_name)
    referenced = IISet(index.referencedObjects())
    return (difference(IISet(catalog._catalog.paths), referenced),
            len(catalog) - len(referenced))

def group(self, seq):
    sortIndex = self._sortIndex
    sortReverse = self._sortReverse
    ns = len(seq)
    ni = len(sortIndex)
    if ns >= 0.1 * ni:
        # result large compared to index -- sort via index
        handled = IISet()
        hn = 0
        _load = getattr(sortIndex, '_load', None)
        if _load is None:
            # not an optimized index
            items = sortIndex.items()
            _load = lambda (x1, x2): x2
            if sortReverse:
                items.reverse()
        elif sortReverse:
            gRO = getattr(sortIndex, 'getReverseOrder', None)
            items = gRO and gRO()
            if items is None:
                items = list(sortIndex._index.keys())
                items.reverse()
        else:
            items = sortIndex._index.keys()
        for i in items:
            ids = intersection(seq, _load(i))
            if ids:
                handled.update(ids)
                hn += len(ids)
                yield i, ids
        if hn != len(seq):
            yield None, difference(seq, handled)
    else:
        # result relatively small -- sort via result
        m = OOBTree()
        keyFor = getattr(sortIndex, 'keyForDocument', None)
        # work around "nogopip" bug: it defines "keyForDocument" as an integer
        if not callable(keyFor):
            # this will fail, when the index neither defines a reasonable
            # "keyForDocument" nor "documentToKeyMap". In this case,
            # the index cannot be used for sorting.
            keyFor = lambda doc, map=sortIndex.documentToKeyMap(): map[doc]
        noValue = IITreeSet()
        for doc in seq.keys():
            try:
                k = keyFor(doc)
            except KeyError:
                noValue.insert(doc)
                continue
            l = m.get(k)
            if l is None:
                l = m[k] = IITreeSet()
            l.insert(doc)
        items = m.items()
        if sortReverse:
            items = list(items)
            items.reverse()
        for i in items:
            yield i
        if noValue:
            yield None, noValue

def _apply_index(self, request, resultset=None):
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index

    for key in record.keys:
        if key:
            # If True, check index
            return (intersection(index, resultset), (self.id, ))
        else:
            # Otherwise, remove from resultset or _unindex
            if resultset is None:
                return (union(difference(self._unindex, index), IISet([])),
                        (self.id, ))
            else:
                return (difference(resultset, index), (self.id, ))
    return (IISet(), (self.id, ))

def _apply_index(self, request, resultset=None):
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    indexed = self._index_value

    for key in record.keys:
        if bool(key) is bool(indexed):
            # If we match the indexed value, check index
            return (intersection(index, resultset), (self.id,))
        else:
            # Otherwise, remove from resultset or _unindex
            if resultset is None:
                return (union(difference(self._unindex, index), IISet([])),
                        (self.id,))
            else:
                return (difference(resultset, index), (self.id,))
    return (IISet(), (self.id,))

def query_index(self, record, resultset=None):
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record, resultset)
        cached = cache.get(cachekey, None)
        if cached is not None:
            if resultset is None:
                return cached
            else:
                return difference(resultset, cached)

    term = self._convertDateTime(record.keys[0])
    if resultset is None:
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))
        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        # Merge from smallest to largest.
        result = multiunion([bounded, until_only, since_only, self._always])
        if cache is not None:
            cache[cachekey] = result
        return result
    else:
        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))
        result = multiunion([since, since_only, until_only, until])
        if cache is not None:
            cache[cachekey] = result
        return difference(resultset, result)

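# Why the inverse branch above works, sketched with toy IISets rather than
# real index buckets: for docids that the index covers, subtracting the set
# of non-matching documents from the incoming resultset yields the same
# docids as intersecting the resultset with the matching set. The values
# below are made up.
from BTrees.IIBTree import IISet, difference, intersection

matches = IISet([1, 2, 3])   # documents inside the date range
inverse = IISet([4, 5])      # indexed documents outside the range
resultset = IISet([2, 4])

assert list(difference(resultset, inverse)) == \
       list(intersection(resultset, matches))  # both: [2]
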
def _group(self, seq):
    spec = self._spec
    cat = self._cat
    vqs = spec._getValueQuerySequence()
    for i in xrange(len(vqs) - 1, -1, -1):
        v, q = vqs[i]
        q = And(LiteralResultSet(seq), q)
        qr = _eval(q, cat)
        if qr:
            yield v, qr
            seq = difference(seq, qr)
            if not seq:
                return
    yield 0, seq

def timing(self, small, large):
    new = 0.0
    old = 0.0
    c = 0.0
    loop = LOOP
    for i in xrange(10):
        start = time()
        difference(small, large)
        old += (time() - start)

        start = time()
        difference2(small, large)
        new += (time() - start)

        if ciidifference is not None:
            start = time()
            ciidifference(small, large)
            c += (time() - start)

    print 'Old x%s: %.6f' % (loop, old)
    print 'New x%s: %.6f' % (loop, new)
    if ciidifference is not None:
        print 'Cyt x%s: %.6f - factor: %.2f' % (loop, c, old / c)

def index_object(self, document_id, obj, threshold=None):
    """Index an object.

    'document_id' is the integer ID of the document.
    'obj' is the object to be indexed.
    'threshold' is the number of words to process between committing
    subtransactions.  If None, subtransactions are disabled.
    """
    new_ranges = self._get_object_data(obj, self.id)
    if new_ranges:
        new_set = IISet(map(self.__index_range, new_ranges))
    else:
        new_set = IISet()

    old_set = self._unindex.get(document_id, IISet())

    new_entries = difference(new_set, old_set)
    expired_entries = difference(old_set, new_set)

    if not (new_entries or expired_entries):
        # nothing to do, bail out!
        return 0

    for expired_entry in expired_entries:
        self.__remove_in_index_set(self._unindex, document_id,
                                   expired_entry)
        if self.__remove_in_index_set(self._index, expired_entry,
                                      document_id):
            # range is not used anymore, retire it
            self.__unindex_range(expired_entry)

    for new_entry in new_entries:
        if self.__insert_in_index_set(self._unindex, document_id,
                                      new_entry):
            self._length.change(1)
        self.__insert_in_index_set(self._index, new_entry, document_id)

    return 1

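# The incremental-update idiom above in isolation: difference() is relative
# to its first argument, so swapping the arguments yields the entries to
# insert versus the entries to retire. Toy values, not real range entries.
from BTrees.IIBTree import IISet, difference

old_set = IISet([100, 101])
new_set = IISet([101, 102])

print(list(difference(new_set, old_set)))  # [102] -> new_entries
print(list(difference(old_set, new_set)))  # [100] -> expired_entries
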
def _eval(self, context):
    csq = self._classifySubqueries()
    if csq['empty']:
        return IISet()  # empty result
    nsq = csq['lookup'] + csq['complex'] + csq['indexed']
    notsq = csq['notQ']
    if not nsq and not notsq:
        # an empty 'And' query
        return context._getObjectIds()
    if not nsq:
        nsq.append(notsq.pop())
    r = None
    for q in nsq:
        r = intersection(r, q._eval(context))
    for q in notsq:
        r = difference(r, q._query._eval(context))
    return r

def below(self, arg):
    """Find all resources at or below path, within the limits given.
    """
    # Parse and validate.
    # ===================
    path, upper, lower = self._path_and_limits(arg)
    rid = self.path2rid.get(path, None)
    if rid is None:
        return

    # Build
    # =====
    parts = path.split(os.sep)
    rids = None
    for level in range(len(parts)):
        rids = intersection(rids, self.parts[(level, parts[level])])
    if rids is None:
        return IISet()  # short-cut

    # Limits
    # ======
    # Remove rids that are above any upper limit, and then only include
    # rids that are above any lower limit. Limits are relative to the
    # level of the requested path.
    if upper is not None:
        upper += level
        for i in range(level, upper):
            if i not in self.levels:
                break
            rids = difference(rids, self.levels[i])
    if lower is not None:
        lower += level
        _rids = []
        for i in range(level, lower):
            if i not in self.levels:
                break
            _rids.append(self.levels[i])
        rids = intersection(rids, multiunion(_rids))

    return rids

def testQuery(record, expect=1):
    cache = idx.getRequestCache()
    cache.clear()

    # First query
    res1 = idx._apply_index(record)

    # Cache set?
    self.assertEqual(cache._sets, expect)

    # Cache miss?
    self.assertEqual(cache._misses, expect)

    # Second Query
    res2 = idx._apply_index(record)

    # Cache hit?
    self.assertEqual(cache._hits, expect)

    # Check if result of second query is equal to first query
    result = difference(res1[0], res2[0])
    self.assertEqual(len(result), 0)

def group(self, seq):
    sortIndex = self._sortIndex
    sortReverse = self._sortReverse
    ns = len(seq)
    ni = len(sortIndex)
    if ns >= 0.1 * ni:
        # result large compared to index -- sort via index
        handled = IISet()
        hn = 0
        _load = getattr(sortIndex, '_load', None)
        if _load is None:
            # not an optimized index
            items = sortIndex.items()
            _load = lambda (x1, x2): x2
            if sortReverse:
                items.reverse()
        elif sortReverse:
            gRO = getattr(sortIndex, 'getReverseOrder', None)
            items = gRO and gRO()
            if items is None:
                items = list(sortIndex._index.keys())
                items.reverse()
        else:
            items = sortIndex._index.keys()
        for i in items:
            ids = intersection(seq, _load(i))
            if ids:
                handled.update(ids)
                hn += len(ids)
                yield i, ids
        if hn != len(seq):
            yield None, difference(seq, handled)
    else:
        # result relatively small -- sort via result
        keyFor = sortIndex.keyForDocument
        m = OOBTree()
        noValue = IITreeSet()
        for doc in seq.keys():
            try:
                k = keyFor(doc)
            except KeyError:
                noValue.insert(doc)
                continue
            l = m.get(k)
            if l is None:
                l = m[k] = IITreeSet()
            l.insert(doc)
        items = m.items()
        if sortReverse:
            items = list(items)
            items.reverse()
        for i in items:
            yield i
        if noValue:
            yield None, noValue

def executeQuery(self, index):
    L = []
    Nots = []
    for subnode in self.getValue():
        if subnode.nodeType() == "NOT":
            r = subnode.getValue().executeQuery(index)
            # If None, technically it matches every doc, but we treat
            # it as if it matched none (we want
            #     real_word AND NOT stop_word
            # to act like plain real_word).
            if r is not None:
                Nots.append((r, 1))
        else:
            r = subnode.executeQuery(index)
            # If None, technically it matches every doc, so needn't be
            # included.
            if r is not None:
                L.append((r, 1))
    set = mass_weightedIntersection(L)
    if Nots:
        notset = mass_weightedUnion(Nots)
        set = difference(set, notset)
    return set

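# Sketch of the final NOT step above, assuming the usual ZCTextIndex shapes:
# the mass_weighted* helpers return IIBucket-like docid->score mappings, and
# difference() on two IIBuckets keeps the first bucket's items whose docids
# are absent from the second. The literal scores here are invented.
from BTrees.IIBTree import IIBucket, difference

scores = IIBucket({1: 10, 2: 7, 3: 4})  # docid -> score for the AND part
notset = IIBucket({2: 1})               # docids matched by NOT subqueries

print(dict(difference(scores, notset)))  # {1: 10, 3: 4}
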
def insertDocument(self, docid, widlist):
    if not self._doc2wid.has_key(docid):
        self._length.change(1)

    enc_widlist = encode(widlist)
    old_enc_widlist = self._doc2wid.get(docid)
    if old_enc_widlist is not None:
        old_enc_widlist = old_enc_widlist.get()  # unwrap _PS instance

    removed_wordids = []
    if old_enc_widlist != enc_widlist:
        self._doc2wid[docid] = _PS(enc_widlist)
        if old_enc_widlist is not None:
            old_widlist = IISet(decode(old_enc_widlist))
            removed_wordids = difference(old_widlist, IISet(widlist))

    tree = self._wid2doc
    tree_has = tree.has_key
    count = 0
    for wid in widlist:
        count += 1
        if not tree_has(wid):
            tree[wid] = DocidList([docid])
        else:
            if not docid in tree[wid]:
                tree[wid].insert(docid)

    for wid in removed_wordids:
        if tree_has(wid):
            try:
                tree[wid].remove(docid)
            except KeyError:
                pass

    self._docweight[docid] = count

def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the request arg.

    The request argument should be a mapping object.

    If the request does not have a key which matches the "id" of
    the index instance, then None is returned.

    If the request *does* have a key which matches the "id" of
    the index instance, one of a few things can happen:

      - if the value is a blank string, None is returned (in
        order to support requests from web forms where
        you can't tell a blank string from empty).

      - if the value is a nonblank string, turn the value into
        a single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

      - If the value is a dict and contains a key of the form
        '<index>_operator' this overrides the default method
        ('or') to combine search results. Valid values are "or"
        and "and".

    If None is not returned as a result of the abovementioned
    constraints, two objects are returned.  The first object is a
    ResultSet containing the record numbers of the matching
    records.  The second object is a tuple containing the names of
    all data fields used.

    FAQ answer:  to search a Field Index for documents that
    have a blank string as their value, wrap the request value
    up in a tuple ala: request = {'id':('',)}
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)
    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = map(self._convert, not_parm)
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = map(self._convert, record.keys)

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if not operator in self.operators:
        raise RuntimeError("operator not valid: %s" % escape(operator))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result, (self.id,)

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            r = multiunion(tmp)
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp
            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            if k is None:
                raise TypeError('None cannot be in an index.')
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can't possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                return IISet(), (self.id,)
            elif isinstance(s, int):
                s = IISet((s,))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result, (self.id,)

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)
            else:
                r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)
            r = resultset
            for s in setlist:
                r = intersection(r, s)

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet(), (self.id,)
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r, (self.id,)

def index_object(self, documentId, obj, threshold=None):
    """index an object, normalizing the indexed value to an integer

       o Normalized value has granularity of one minute.

       o Objects which have 'None' as indexed value are *omitted*,
         by design.

       o Repeat by recurdef - a RFC2445 recurrence definition string
    """
    returnStatus = 0

    try:
        date_attr = getattr(obj, self.id)
        if safe_callable(date_attr):
            date_attr = date_attr()
    except AttributeError:
        return returnStatus

    recurdef = getattr(obj, self.attr_recurdef, None)
    if safe_callable(recurdef):
        recurdef = recurdef()

    if not recurdef:
        dates = [pydt(date_attr)]
    else:
        until = getattr(obj, self.attr_until, None)
        if safe_callable(until):
            until = until()

        dates = recurrence_sequence_ical(date_attr, recrule=recurdef,
                                         until=until)

    newvalues = IISet(map(dt2int, dates))
    oldvalues = self._unindex.get(documentId, _marker)
    if oldvalues is not _marker:
        oldvalues = IISet(oldvalues)

    if oldvalues is not _marker and newvalues is not _marker \
            and not difference(newvalues, oldvalues) \
            and not difference(oldvalues, newvalues):
        # difference is calculated relative to first argument, so we have
        # to use it twice here
        return returnStatus

    if oldvalues is not _marker:
        for oldvalue in oldvalues:
            self.removeForwardIndexEntry(oldvalue, documentId)
        if newvalues is _marker:
            try:
                del self._unindex[documentId]
            except ConflictError:
                raise
            except:
                LOG.error("Should not happen: oldvalues was there,"
                          " now it's not, for document with id %s" %
                          documentId)

    if newvalues is not _marker:
        inserted = False
        for value in newvalues:
            self.insertForwardIndexEntry(value, documentId)
            inserted = True
        if inserted:
            # store tuple values in reverse index entries for sorting
            self._unindex[documentId] = tuple(newvalues)
            returnStatus = 1

    return returnStatus

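# Why the comment above insists on calling difference() twice (toy values):
# an empty difference(new, old) only proves new is a subset of old, so the
# reverse direction must also be checked before skipping the reindex.
from BTrees.IIBTree import IISet, difference

newvalues = IISet([1, 2])
oldvalues = IISet([1, 2, 3])

print(len(difference(newvalues, oldvalues)))  # 0 -- but the sets still differ
print(len(difference(oldvalues, newvalues)))  # 1 -- the stale entry 3
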
def index_object(self, documentId, obj, threshold=None):
    """ Index an object:

    'documentId' is the integer id of the document

    'obj' is the object to be indexed

    'threshold' is the number of words to process between
    committing subtransactions.  If 'None' subtransactions are
    disabled.
    """
    # sniff the object for our 'id', the 'document source' of the
    # index is this attribute.  If it smells callable, call it.
    try:
        source = getattr(obj, self.id)
        if safe_callable(source):
            source = source()
        if not isinstance(source, UnicodeType):
            source = str(source)
    except (AttributeError, TypeError):
        return 0

    # sniff the object for 'id'+'_encoding'
    try:
        encoding = getattr(obj, self.id + '_encoding')
        if safe_callable(encoding):
            encoding = str(encoding())
        else:
            encoding = str(encoding)
    except (AttributeError, TypeError):
        encoding = 'latin1'

    lexicon = self.getLexicon()
    splitter = lexicon.Splitter

    wordScores = OIBTree()
    last = None

    # Run through the words and score them
    for word in list(splitter(source, encoding=encoding)):
        if word[0] == '\"':
            last = self._subindex(word[1:-1], wordScores, last, splitter)
        else:
            if word == last:
                continue
            last = word
            wordScores[word] = wordScores.get(word, 0) + 1

    # Convert scores to use wids:
    widScores = IIBucket()
    getWid = lexicon.getWordId
    for word, score in wordScores.items():
        widScores[getWid(word)] = score
    del wordScores

    currentWids = IISet(self._unindex.get(documentId, []))

    # Get rid of document words that are no longer indexed
    self.unindex_objectWids(documentId, difference(currentWids, widScores))

    # Now index the words. Note that the new xIBTrees are clever
    # enough to do nothing when there isn't a change. Woo hoo.
    insert = self.insertForwardIndexEntry
    for wid, score in widScores.items():
        insert(wid, documentId, score)

    # Save the unindexing info if it's changed:
    wids = widScores.keys()
    if wids != currentWids.keys():
        self._unindex[documentId] = wids

    return len(wids)

def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in the request arg.

    The request argument should be a mapping object.

    If the request does not have a key which matches the "id" of
    the index instance, then None is returned.

    If the request *does* have a key which matches the "id" of
    the index instance, one of a few things can happen:

      - if the value is a blank string, None is returned (in
        order to support requests from web forms where
        you can't tell a blank string from empty).

      - if the value is a nonblank string, turn the value into
        a single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

      - If the value is a dict and contains a key of the form
        '<index>_operator' this overrides the default method
        ('or') to combine search results. Valid values are "or"
        and "and".

    If None is not returned as a result of the abovementioned
    constraints, two objects are returned.  The first object is a
    ResultSet containing the record numbers of the matching
    records.  The second object is a tuple containing the names of
    all data fields used.

    FAQ answer:  to search a Field Index for documents that
    have a blank string as their value, wrap the request value
    up in a tuple ala: request = {'id':('',)}
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)
    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = map(self._convert, not_parm)
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = map(self._convert, record.keys)

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if not operator in self.operators:
        raise RuntimeError("operator not valid: %s" % escape(operator))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result, (self.id, )

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            r = multiunion(tmp)
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp
            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can't possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                return IISet(), (self.id, )
            elif isinstance(s, int):
                s = IISet((s, ))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result, (self.id, )

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)
            else:
                r = multiunion(setlist)
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)
            r = resultset
            for s in setlist:
                r = intersection(r, s)

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet(), (self.id, )
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r, (self.id, )

def query_index(self, record, resultset=None):
    """Search the index with the given IndexQuery object.

    If the query has a key which matches the 'id' of
    the index instance, one of a few things can happen:

      - if the value is a string, turn the value into
        a single-element sequence, and proceed.

      - if the value is a sequence, return a union search.

      - If the value is a dict and contains a key of the form
        '<index>_operator' this overrides the default method
        ('or') to combine search results. Valid values are 'or'
        and 'and'.
    """
    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)

    operator = record.operator

    cachekey = None
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record)
        if cachekey is not None:
            cached = None
            if operator == 'or':
                cached = cache.get(cachekey, None)
            else:
                cached_setlist = cache.get(cachekey, None)
                if cached_setlist is not None:
                    r = resultset
                    for s in cached_setlist:
                        # the result is bound by the resultset
                        r = intersection(r, s)
                        # If intersection, we can't possibly get a
                        # smaller result
                        if not r:
                            break
                    cached = r

            if cached is not None:
                if isinstance(cached, int):
                    cached = IISet((cached, ))
                if not_parm:
                    not_parm = list(map(self._convert, not_parm))
                    exclude = self._apply_not(not_parm, resultset)
                    cached = difference(cached, exclude)
                return cached

    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = list(map(self._convert, not_parm))
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = list(map(self._convert, record.keys))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = "range"
        opr_args = []
        if range_parm.find("min") > -1:
            opr_args.append("min")
        if range_parm.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            r = multiunion(tmp)
            if cachekey is not None:
                cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s, ))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp

            # 'r' is not invariant of resultset. Thus, we
            # have to remember 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break
    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            if k is None:
                # Prevent None from being looked up. None doesn't
                # have a valid ordering definition compared to any
                # other object. BTrees 4.0+ will throw a TypeError
                # "object has default comparison".
                continue
            s = index.get(k, None)
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                if cachekey is not None:
                    # If operator is 'and', we have to cache a list of
                    # IISet objects
                    cache[cachekey] = [IISet()]
                return IISet()
            elif isinstance(s, int):
                s = IISet((s, ))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result, ))
            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]
            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'. But
                # this is maybe a performance killer. So we do not cache.
                # if cachekey is not None:
                #     cache[cachekey] = multiunion(setlist)
            else:
                r = multiunion(setlist)
                if cachekey is not None:
                    cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)

            # 'r' is not invariant of resultset. Thus, we
            # have to remember the union of 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet()
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r

def generate(seq, vqs, mv):
    if not vqs:
        yield 0, seq
        return
    vqs = vqs[:]  # avoid side effects
    v, q = vqs.pop()
    mv -= v
    q = And(LiteralResultSet(seq), q)
    qr = _eval(q, cat)
    if qr:
        feed1 = generate(qr, vqs, mv)
        seq = difference(seq, qr)
    else:
        feed1 = None
    feed2 = seq and generate(seq, vqs, mv) or None

    def fetch1():
        if feed1 is None:
            return None
        try:
            val, subseq = feed1.next()
            return val + v, subseq
        except StopIteration:
            return None

    def fetch2():
        if feed2 is None:
            return None
        try:
            return feed2.next()
        except StopIteration:
            return None

    g1 = fetch1()
    # largest value from "feed1" only
    while g1 is not None and g1[0] > mv:
        yield g1
        g1 = fetch1()

    # merge largest values from "feed1" and "feed2"
    g2 = fetch2()
    while g1 is not None and g2 is not None:
        v1 = g1[0]
        v2 = g2[0]
        if v1 > v2:
            yield g1
            g1 = fetch1()
        elif v2 > v1:
            yield g2
            g2 = fetch2()
        # Note: g1[1] was copied (by the "intersection" above); therefore,
        # we can destructively change it
        else:
            g1[1].update(g2[1])
            yield g1
            g1 = fetch1()
            g2 = fetch2()

    # handle feed1
    while g1 is not None:
        yield g1
        g1 = fetch1()
    # handle feed2
    while g2 is not None:
        yield g2
        g2 = fetch2()

def inverseResultSet(all_docids, set):
    """ perform difference between all docids and a resultset """
    docids = difference(DocidList(all_docids), set.getDocids())
    return ResultSet(docids, set.getWords())

def _eval(self, context):
    return difference(context._getObjectIds(), self._query._eval(context))

def difference(self, *args):
    from BTrees.IIBTree import difference
    return difference(*args)

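# Usage notes for the thin wrapper above, showing the None conventions that
# several callers in this section rely on (the values are arbitrary):
from BTrees.IIBTree import IISet, difference

a = IISet([1, 2, 3])
b = IISet([2])

print(list(difference(a, b)))     # [1, 3]
print(difference(None, b))        # None -- a None first argument stays None
print(list(difference(a, None)))  # [1, 2, 3] -- a None second argument
                                  # subtracts nothing
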
def index_object(self, documentId, obj, threshold=None):
    """index an object, normalizing the indexed value to an integer

       o Normalized value has granularity of one minute.

       o Objects which have 'None' as indexed value are *omitted*,
         by design.

       o Repeat by recurdef - a RFC2445 recurrence definition string
    """
    returnStatus = 0

    try:
        date_attr = getattr(obj, self.id)
        if safe_callable(date_attr):
            date_attr = date_attr()
    except AttributeError:
        return returnStatus

    recurdef = getattr(obj, self.attr_recurdef, None)
    if safe_callable(recurdef):
        recurdef = recurdef()

    if not recurdef:
        dates = [pydt(date_attr)]
    else:
        until = getattr(obj, self.attr_until, None)
        if safe_callable(until):
            until = until()

        dates = recurrence_sequence_ical(
            date_attr, recrule=recurdef, until=until)

    newvalues = IISet(map(dt2int, dates))
    oldvalues = self._unindex.get(documentId, _marker)
    if oldvalues is not _marker:
        oldvalues = IISet(oldvalues)

    if oldvalues is not _marker and newvalues is not _marker \
            and not difference(newvalues, oldvalues) \
            and not difference(oldvalues, newvalues):
        # difference is calculated relative to first argument, so we have
        # to use it twice here
        return returnStatus

    if oldvalues is not _marker:
        for oldvalue in oldvalues:
            self.removeForwardIndexEntry(oldvalue, documentId)
        if newvalues is _marker:
            try:
                del self._unindex[documentId]
            except ConflictError:
                raise
            except Exception:
                LOG.error("Should not happen: oldvalues was there,"
                          " now it's not, for document with id %s" %
                          documentId)

    if newvalues is not _marker:
        inserted = False
        for value in newvalues:
            self.insertForwardIndexEntry(value, documentId)
            inserted = True
        if inserted:
            # store tuple values in reverse index entries for sorting
            self._unindex[documentId] = tuple(newvalues)
            returnStatus = 1

    if returnStatus > 0:
        self._increment_counter()

    return returnStatus

def _apply_index(self, request, resultset=None):
    """Apply the index to query parameters given in 'request', which
    should be a mapping object.

    If the request does not contain the needed parameters, then
    return None.

    Otherwise return two objects.  The first object is a ResultSet
    containing the record numbers of the matching records.  The
    second object is a tuple containing the names of all data fields
    used.
    """
    iid = self.id
    record = parseIndexRequest(request, iid, self.query_options)
    if record.keys is None:
        return None

    term = self._convertDateTime(record.keys[0])

    REQUEST = aq_get(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = self._cache_key(catalog)
            cache = REQUEST.get(key, None)
            tid = isinstance(term, int) and term / 10 or 'None'
            if resultset is None:
                cachekey = '_daterangeindex_%s_%s' % (iid, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (iid, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if resultset is None:
                        return (cached,
                                (self._since_field, self._until_field))
                    else:
                        return (difference(resultset, cached),
                                (self._since_field, self._until_field))

    if resultset is None:
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))

        # Total result is bound by resultset
        if REQUEST is None:
            until = intersection(resultset, until)

        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        # Merge from smallest to largest.
        result = multiunion([bounded, until_only, since_only,
                             self._always])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return (result, (self._since_field, self._until_field))
    else:
        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([since, since_only, until_only, until])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result
        return (difference(resultset, result),
                (self._since_field, self._until_field))

def and_not(self, x):
    return self.__class__(
        difference(self._dict, x._dict),
        self._words,
        self._index,
    )

def _apply_index(self, request, resultset=None):
    """ Apply the index to query parameters given in 'request', which
    should be a mapping object.

    If the request does not contain the needed parameters, then
    return None.

    Otherwise return two objects.  The first object is a ResultSet
    containing the record numbers of the matching records.  The
    second object is a tuple containing the names of all data fields
    used.
    """
    iid = self.id
    record = parseIndexRequest(request, iid, self.query_options)
    if record.keys is None:
        return None

    term = self._convertDateTime(record.keys[0])

    REQUEST = aq_get(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = self._cache_key(catalog)
            cache = REQUEST.get(key, None)
            tid = isinstance(term, int) and term / 10 or 'None'
            if resultset is None:
                cachekey = '_daterangeindex_%s_%s' % (iid, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (iid, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if resultset is None:
                        return (cached,
                                (self._since_field, self._until_field))
                    else:
                        return (difference(resultset, cached),
                                (self._since_field, self._until_field))

    if resultset is None:
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))

        # Total result is bound by resultset
        if REQUEST is None:
            until = intersection(resultset, until)

        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        # Merge from smallest to largest.
        result = multiunion([bounded, until_only, since_only,
                             self._always])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return (result, (self._since_field, self._until_field))
    else:
        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([until_only, since_only, until, since])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result
        return (difference(resultset, result),
                (self._since_field, self._until_field))

def daterangeindex_apply_index(self, request, cid='', res=None):
    record = parseIndexRequest(request, self.getId())
    if record.keys is None:
        return None

    term = self._convertDateTime(record.keys[0])

    REQUEST = getattr(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = '%s_%s' % (catalog.getId(), catalog.getCounter())
            cache = REQUEST.get(key, None)
            tid = isinstance(term, int) and term / 10 or 'None'
            index_id = self.getId()
            if res is None:
                cachekey = '_daterangeindex_%s_%s' % (index_id, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (index_id, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if res is None:
                        return cached, (self._since_field,
                                        self._until_field)
                    else:
                        return (difference(res, cached),
                                (self._since_field, self._until_field))

    if res is None:
        #
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        # XXX Does this apply for multiunion?
        #
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))

        # Total result is bound by res
        if REQUEST is None:
            until = intersection(res, until)

        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        result = multiunion([bounded, until_only, since_only, self._always])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result
        return result, (self._since_field, self._until_field)
    else:
        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([until_only, since_only, until, since])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result
        return difference(res, result), (self._since_field,
                                         self._until_field)

def checkCatalog(path, indexes):
    """ perform some consistency checks on a ZCatalog instance"""

    root = Zope2.app()

    try:
        catalog = root.unrestrictedTraverse(path)
    except AttributeError:
        print 'Error: catalog object not found'
        sys.exit(1)

    # get Catalog instance
    _cat = catalog._catalog

    # check Catalog internal BTrees
    l_data = list(_cat.data.keys())
    l_data.sort()
    l_uids = list(_cat.uids.values())
    l_uids.sort()
    # compare against the rid->path mapping (not data again)
    l_paths = list(_cat.paths.keys())
    l_paths.sort()

    print "Checking catalog internal BTrees"
    print "\tINFO: Mapping data:  %d entries" % len(l_data)
    print "\tINFO: Mapping uids:  %d entries" % len(l_uids)
    print "\tINFO: Mapping paths: %d entries" % len(l_paths)

    if l_data == l_uids:
        print "\tOK: Mapping data equals Mapping uids"
    else:
        print "\tERR: Mapping data does not equal Mapping uids"

    if l_data == l_paths:
        print "\tOK: Mapping data equals Mapping paths"
    else:
        print "\tERR: Mapping data does not equal Mapping paths"

    # check BTrees of indexes
    for id, idx in _cat.indexes.items():
        if indexes and not idx.meta_type in indexes:
            continue

        print "Checking index '%s' (type: %s)" % (id, idx.meta_type)

        if idx.meta_type in ['FieldIndex', 'KeywordIndex']:
            # check forward entries
            RIDS = IISet()
            for key, rids in idx._index.items():
                if isinstance(rids, IntType):
                    RIDS.insert(rids)
                else:
                    map(RIDS.insert, rids.keys())

            diff = difference(RIDS, IISet(_cat.data.keys()))
            if len(diff) != 0:
                print '\tERR: Problem with forward entries'
                print '\tERR: too many forward entries:', diff
            else:
                print '\tOK: Forward entries (%d entries)' % (len(RIDS))

        elif idx.meta_type in ['PathIndex']:
            RIDS = IISet()
            for rids in map(None, idx._index.values()):
                map(RIDS.insert, rids.values()[0])

            diff = difference(RIDS, IISet(_cat.data.keys()))
            if len(diff) != 0:
                print '\tERR: Problem with forward entries'
                print '\tERR: too many forward entries:', diff
            else:
                print '\tOK: Forward entries (%d entries)' % (len(RIDS))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex', 'PathIndex']:
            # check backward entries
            RIDS = IISet(idx._unindex.keys())
            diff = difference(RIDS, IISet(_cat.data.keys()))
            if len(diff) != 0:
                print '\tERR: Problem with backward entries'
                print '\tERR: too many backward entries:', diff
            else:
                print '\tOK: Backward entries (%d entries)' % (len(RIDS))

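# The core consistency test used by checkCatalog above, reduced to toy data:
# any rid present in an index structure but missing from the catalog's data
# mapping shows up in the difference and signals a stale entry.
from BTrees.IIBTree import IISet, difference

rids_in_index = IISet([1, 2, 99])
rids_in_catalog = IISet([1, 2, 3])

stale = difference(rids_in_index, rids_in_catalog)
print(list(stale))  # [99]
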
def query_index(self, record, resultset=None):
    """Search the index with the given IndexQuery object.

    If not `None`, the resultset argument indicates that the search result
    is relevant only on this set, i.e. everything outside resultset is of
    no importance. The index can use this information for optimizations.
    """
    index = self._index
    r = None
    opr = None

    # not / exclude parameter
    not_parm = record.get('not', None)

    operator = record.operator

    cachekey = None
    cache = self.getRequestCache()
    if cache is not None:
        cachekey = self.getRequestCacheKey(record)
        if cachekey is not None:
            cached = None
            if operator == 'or':
                cached = cache.get(cachekey, None)
            else:
                cached_setlist = cache.get(cachekey, None)
                if cached_setlist is not None:
                    r = resultset
                    for s in cached_setlist:
                        # the result is bound by the resultset
                        r = intersection(r, s)
                        # If intersection, we can't possibly get a
                        # smaller result
                        if not r:
                            break
                    cached = r

            if cached is not None:
                if isinstance(cached, int):
                    cached = IISet((cached, ))

                if not_parm:
                    not_parm = list(map(self._convert, not_parm))
                    exclude = self._apply_not(not_parm, resultset)
                    cached = difference(cached, exclude)

                return cached

    if not record.keys and not_parm:
        # convert into indexed format
        not_parm = list(map(self._convert, not_parm))
        # we have only a 'not' query
        record.keys = [k for k in index.keys() if k not in not_parm]
    else:
        # convert query arguments into indexed format
        record.keys = list(map(self._convert, record.keys))

    # Range parameter
    range_parm = record.get('range', None)
    if range_parm:
        opr = 'range'
        opr_args = []
        if range_parm.find('min') > -1:
            opr_args.append('min')
        if range_parm.find('max') > -1:
            opr_args.append('max')

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == 'range':  # range search
        if 'min' in opr_args:
            lo = min(record.keys)
        else:
            lo = None
        if 'max' in opr_args:
            hi = max(record.keys)
        else:
            hi = None
        if hi:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        # If we only use one key, intersect and return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))

            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]

            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            r = multiunion(tmp)

            if cachekey is not None:
                cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            tmp = []
            for s in setlist:
                if isinstance(s, int):
                    s = IISet((s,))
                tmp.append(s)
            if len(tmp) > 2:
                setlist = sorted(tmp, key=len)
            else:
                setlist = tmp

            # 'r' is not invariant of resultset. Thus, we
            # have to remember 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                # the result is bound by the resultset
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    else:  # not a range search
        # Filter duplicates
        setlist = []
        for k in record.keys:
            if k is None:
                # Prevent None from being looked up. None doesn't
                # have a valid ordering definition compared to any
                # other object. BTrees 4.0+ will throw a TypeError
                # "object has default comparison".
                continue
            try:
                s = index.get(k, None)
            except TypeError:
                # key is not valid for this BTree so the value is None
                LOG.error(
                    '%(context)s: query_index tried '
                    'to look up key %(key)r from index %(index)r '
                    'but key was of the wrong type.',
                    dict(
                        context=self.__class__.__name__,
                        key=k,
                        index=self.id,
                    )
                )
                s = None
            # If None, try to bail early
            if s is None:
                if operator == 'or':
                    # If union, we can possibly get a bigger result
                    continue
                # If intersection, we can't possibly get a smaller result
                if cachekey is not None:
                    # If operator is 'and', we have to cache a list of
                    # IISet objects
                    cache[cachekey] = [IISet()]
                return IISet()
            elif isinstance(s, int):
                s = IISet((s,))
            setlist.append(s)

        # If we only use one key return immediately
        if len(setlist) == 1:
            result = setlist[0]
            if isinstance(result, int):
                result = IISet((result,))

            if cachekey is not None:
                if operator == 'or':
                    cache[cachekey] = result
                else:
                    cache[cachekey] = [result]

            if not_parm:
                exclude = self._apply_not(not_parm, resultset)
                result = difference(result, exclude)
            return result

        if operator == 'or':
            # If we already get a small result set passed in, intersecting
            # the various indexes with it and doing the union later is
            # faster than creating a multiunion first.
            if resultset is not None and len(resultset) < 200:
                smalllist = []
                for s in setlist:
                    smalllist.append(intersection(resultset, s))
                r = multiunion(smalllist)

                # 'r' is not invariant of resultset. Thus, we would
                # have to cache the union of 'setlist'. But that
                # is maybe a performance killer. So we do not cache.
                # if cachekey is not None:
                #     cache[cachekey] = multiunion(setlist)
            else:
                r = multiunion(setlist)
                if cachekey is not None:
                    cache[cachekey] = r
        else:
            # For intersection, sort with smallest data set first
            if len(setlist) > 2:
                setlist = sorted(setlist, key=len)

            # 'r' is not invariant of resultset. Thus, we
            # have to remember 'setlist'
            if cachekey is not None:
                cache[cachekey] = setlist

            r = resultset
            for s in setlist:
                r = intersection(r, s)
                # If intersection, we can't possibly get a smaller result
                if not r:
                    break

    if isinstance(r, int):
        r = IISet((r, ))
    if r is None:
        return IISet()
    if not_parm:
        exclude = self._apply_not(not_parm, resultset)
        r = difference(r, exclude)
    return r
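To make the record contract above concrete, here is a minimal sketch. `Record` is a hypothetical stand-in for the IndexQuery object that `query_index` receives, reduced to the three things the method actually touches: `keys`, `operator`, and `get`.

class Record:
    # hypothetical stand-in for the IndexQuery object; only the
    # attributes query_index actually uses are modelled here
    def __init__(self, keys, operator='or', **extra):
        self.keys = keys
        self.operator = operator
        self._extra = extra

    def get(self, name, default=None):
        return self._extra.get(name, default)

# range query: all documents whose value lies between 10 and 20
r = Record([10, 20], range='min:max')

# pure 'not' query: all documents except those indexed under 42
# ('not' is a Python keyword, hence the dict unpacking)
r = Record([], **{'not': [42]})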
def daterangeindex_apply_index(self, request, cid='', res=None):
    record = parseIndexRequest(request, self.getId())
    if record.keys is None:
        return None

    term = self._convertDateTime(record.keys[0])

    REQUEST = getattr(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = '%s_%s' % (catalog.getId(), catalog.getCounter())
            cache = REQUEST.get(key, None)
            tid = isinstance(term, int) and term / 10 or 'None'
            index_id = self.getId()
            if res is None:
                cachekey = '_daterangeindex_%s_%s' % (index_id, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (index_id, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if res is None:
                        return (cached,
                                (self._since_field, self._until_field))
                    else:
                        return (difference(res, cached),
                                (self._since_field, self._until_field))

    if res is None:
        #
        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        # XXX Does this apply for multiunion?
        #
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))

        # Total result is bound by res
        if REQUEST is None:
            until = intersection(res, until)

        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)

        result = multiunion([bounded, until_only, since_only, self._always])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return result, (self._since_field, self._until_field)
    else:
        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([until_only, since_only, until, since])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return (difference(res, result),
                (self._since_field, self._until_field))
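The inverse branch leans on the complement of a date-range match: a document fails to match term t exactly when its until value lies before t or its since value lies after t, so subtracting the union of those misses from res yields the matches. A toy check of that identity with plain IISets (hypothetical rid sets, not the index's real structures):

from BTrees.IIBTree import IISet, difference, multiunion

all_rids = IISet(range(6))
matches = IISet([0, 3])     # rids whose [since, until] contains t
too_late = IISet([1, 4])    # since > t
too_early = IISet([2, 5])   # until < t

inverse = multiunion([too_late, too_early])
assert list(difference(all_rids, inverse)) == list(matches)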
def index_object(self, documentId, obj, threshold=None):
    """ Index an object:

    'documentId' is the integer id of the document

    'obj' is the object to be indexed

    'threshold' is the number of words to process between committing
    subtransactions. If 'None', subtransactions are disabled.
    """
    # sniff the object for our 'id', the 'document source' of the
    # index is this attribute. If it smells callable, call it.
    try:
        source = getattr(obj, self.id)
        if safe_callable(source):
            source = source()
        if not isinstance(source, UnicodeType):
            source = str(source)
    except (AttributeError, TypeError):
        return 0

    # sniff the object for 'id'+'_encoding'
    try:
        encoding = getattr(obj, self.id + '_encoding')
        if safe_callable(encoding):
            encoding = str(encoding())
        else:
            encoding = str(encoding)
    except (AttributeError, TypeError):
        encoding = 'latin1'

    lexicon = self.getLexicon()

    splitter = lexicon.Splitter

    wordScores = OIBTree()
    last = None

    # Run through the words and score them
    for word in list(splitter(source, encoding=encoding)):
        if word[0] == '\"':
            last = self._subindex(word[1:-1], wordScores, last, splitter)
        else:
            if word == last:
                continue
            last = word
            wordScores[word] = wordScores.get(word, 0) + 1

    # Convert scores to use wids:
    widScores = IIBucket()
    getWid = lexicon.getWordId
    for word, score in wordScores.items():
        widScores[getWid(word)] = score

    del wordScores

    currentWids = IISet(self._unindex.get(documentId, []))

    # Get rid of document words that are no longer indexed
    self.unindex_objectWids(documentId, difference(currentWids, widScores))

    # Now index the words. Note that the new xIBTrees are clever
    # enough to do nothing when there isn't a change. Woo hoo.
    insert = self.insertForwardIndexEntry
    for wid, score in widScores.items():
        insert(wid, documentId, score)

    # Save the unindexing info if it's changed:
    wids = widScores.keys()
    if wids != currentWids.keys():
        self._unindex[documentId] = wids

    return len(wids)
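The effect of the `last` bookkeeping in the scoring loop is easy to miss: it suppresses adjacent repetitions only, not every duplicate. A standalone sketch of just that pass, with a plain dict standing in for the OIBTree and whitespace-split words standing in for the lexicon's Splitter:

def score_words(words):
    # same adjacent-duplicate suppression as the loop in index_object
    scores = {}
    last = None
    for word in words:
        if word == last:
            continue
        last = word
        scores[word] = scores.get(word, 0) + 1
    return scores

print score_words('to be or not to be'.split())
# -> {'to': 2, 'be': 2, 'or': 1, 'not': 1}
print score_words('new new york'.split())
# -> {'new': 1, 'york': 1}   adjacent repeats count once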
import time
from random import choice

from BTrees.IIBTree import IISet, union, intersection, difference


def make_choice(data, per):
    # draw roughly per% of the elements (with repetition)
    data_len = len(data)
    return [choice(data) for i in range(int(data_len * per / 100.0))]


for size in (500, 2500, 5000, 10000, 25000, 50000, 100000):
    data = range(size)
    for p1, p2 in ((25, 25), (25, 50), (25, 75), (25, 100), (50, 50),
                   (50, 75), (50, 100), (75, 75), (75, 100), (100, 100)):
        d1 = IISet(make_choice(data, p1))
        d2 = IISet(make_choice(data, p2))

        ts = time.time()
        union(d1, d2)
        tu = time.time() - ts

        ts = time.time()
        intersection(d1, d2)
        ti = time.time() - ts

        ts = time.time()
        difference(d1, d2)
        td = time.time() - ts

        print '%6d %3d:%3d %6.6f %6.6f %6.6f' % (size, p1, p2, tu, ti, td)
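The timings above cover the two-set primitives only. A companion sketch for the many-small-sets case, where multiunion usually beats repeated pairwise union (the set sizes are arbitrary, and the numbers depend on machine and BTrees version):

import time

from BTrees.IIBTree import IISet, multiunion, union

# 100 disjoint sets of 50 ints each
sets = [IISet(range(i, i + 50)) for i in range(0, 5000, 50)]

ts = time.time()
r = None
for s in sets:
    # union(None, s) returns s, so r accumulates the running union
    r = union(r, s)
tp = time.time() - ts

ts = time.time()
r = multiunion(sets)
tm = time.time() - ts

print 'pairwise union: %6.6f  multiunion: %6.6f' % (tp, tm)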