示例#1
0
    def _reindex_doc(self, docid, text):
        """Re-index ``docid`` against ``text``, rewriting only what changed.

        The word ids previously recorded for the document are compared with
        the ones produced from ``text``.  Word-info entries are deleted for
        words that vanished, added for words that are new, and rewritten for
        shared words only when their score differs.

        Returns the number of word ids produced from ``text``.
        """
        prev_wids = self.get_words(docid)
        prev_scores, _prev_docw = self._get_frequencies(prev_wids)

        cur_wids = self._lexicon.sourceToWordIds(text)
        cur_scores, cur_docw = self._get_frequencies(cur_wids)

        prev_set = IITreeSet(prev_scores.keys())
        cur_set = IITreeSet(cur_scores.keys())
        shared = intersection(prev_set, cur_set)
        vanished = difference(prev_set, shared)
        introduced = difference(cur_set, shared)
        # The full sets are not needed once the three partitions exist.
        del prev_set, cur_set

        for wid in vanished.keys():
            self._del_wordinfo(wid, docid)

        for wid in introduced.keys():
            self._add_wordinfo(wid, cur_scores[wid], docid)

        # For the Okapi indexer, the comparison below succeeds only for words
        # whose counts have changed.  For the cosine indexer it may succeed
        # for every wid, since W(d) probably changed and W(d) is divided
        # into every score.
        for wid in shared.keys():
            score = cur_scores[wid]
            if prev_scores[wid] != score:
                self._add_wordinfo(wid, score, docid)

        self._docweight[docid] = cur_docw
        self._docwords[docid] = WidCode.encode(cur_wids)
        return len(cur_wids)
示例#2
0
    def index_object(self, documentId, obj, threshold=None):
        """Index every configured attribute of ``obj`` under ``documentId``.

        Wrapper handling the indexing of multiple attributes: each field is
        handed to ``_index_object`` and the word ids it returns are
        collected.  Word ids stored for the document but no longer produced
        are removed from the storage; of the collected word ids only those
        not stored yet are inserted.

        Returns the total number of collected word ids (duplicates included).
        """
        # Needed for backward compatibility: fall back to the index's own id
        # when the instance has no ``_indexed_fields`` attribute.
        try:
            fields = self._indexed_fields
        except AttributeError:
            fields = [self.id]

        all_wids = []
        for attr in fields:
            try:
                wids = self._index_object(documentId, obj, threshold, attr)
                if wids is not None:
                    all_wids.extend(wids)
            except Exception:
                # Best effort: a field that fails to index is skipped.
                # Catching ``Exception`` instead of using a bare ``except``
                # lets KeyboardInterrupt and SystemExit propagate.
                # NOTE(review): if this runs inside a ZODB transaction,
                # conflict errors are presumably swallowed here too -- confirm
                # whether they should be re-raised.
                pass

        # get rid of words removed by reindexing
        try:
            o_wids = IISet(self._storage.getWordIdsForDocId(documentId))
        except KeyError:
            o_wids = IISet()

        all_wids_set = IISet(all_wids)
        remove_wids = difference(o_wids, all_wids_set)
        insert_wids = difference(all_wids_set, o_wids)
        # Plain set of the new word ids, for fast membership tests while
        # filtering ``all_wids`` below.
        new_wids = set(insert_wids.keys())

        if len(remove_wids) > 0:
            self._storage.removeWordIdsForDocId(documentId, remove_wids)
        if all_wids:
            self._storage.insert(
                [w for w in all_wids if w in new_wids], documentId)
        return len(all_wids)
示例#3
0
    def _reindex_doc(self, docid, text):
        """Update the index entries of ``docid`` to reflect ``text``.

        Only the per-word entries that differ between the stored word ids
        and the freshly computed ones are touched.  Returns the number of
        word ids computed from ``text``.
        """
        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
        stale_wids = self.get_words(docid)
        stale_w, _ignored_docw = self._get_frequencies(stale_wids)

        fresh_wids = self._lexicon.sourceToWordIds(text)
        fresh_w, fresh_docw = self._get_frequencies(fresh_wids)

        stale_keys = IITreeSet(stale_w.keys())
        fresh_keys = IITreeSet(fresh_w.keys())
        common = intersection(stale_keys, fresh_keys)
        gone = difference(stale_keys, common)
        arrived = difference(fresh_keys, common)
        del stale_keys, fresh_keys

        # Words that disappeared from the document.
        for wid in gone.keys():
            self._del_wordinfo(wid, docid)

        # Words that appear in the document for the first time.
        for wid in arrived.keys():
            self._add_wordinfo(wid, fresh_w[wid], docid)

        # Words present before and after: rewrite only on a score change.
        # For the Okapi indexer that happens only for words whose counts
        # have changed.  For the cosine indexer it may happen for every wid,
        # since W(d) probably changed and W(d) is divided into every score.
        for wid in common.keys():
            fresh_score = fresh_w[wid]
            if stale_w[wid] != fresh_score:
                self._add_wordinfo(wid, fresh_score, docid)

        self._docweight[docid] = fresh_docw
        self._docwords[docid] = WidCode.encode(fresh_wids)
        return len(fresh_wids)
示例#4
0
 def _update(self, documentId, val, oldval, threshold):
     """Move ``documentId`` from the old value set to the new one.

     ``val`` is wrapped in an ``IITreeSet``; ``oldval`` is first passed
     through ``_unindexVal2Val``.  Values only in the new set are indexed,
     values only in the old set are unindexed.

     Returns the number of newly indexed values.
     """
     current = IITreeSet(val)
     previous = IITreeSet(self._unindexVal2Val(oldval))
     to_index = difference(current, previous)
     to_unindex = difference(previous, current)
     if to_index:
         self._indexValue(documentId, to_index, threshold)
     if to_unindex:
         self._unindexValue(documentId, to_unindex)
     return len(to_index)
示例#5
0
 def _update(self, documentId, val, oldval, threshold):
   """Index the values added to, and unindex those removed from, ``oldval``.

   ``oldval`` is then brought in line with ``val`` in place.

   Returns a one-element tuple holding the number of added values.
   """
   added = difference(val, oldval)
   removed = difference(oldval, val)
   if added:
     self._indexValue(documentId, added, threshold)
   if removed:
     self._unindexValue(documentId, removed)
   # optimize transaction size by not writing _unindex bucket
   if len(removed) >= 100:
     # many removals: rebuild the old value set from scratch
     oldval.clear()
     oldval.update(val)
   else:
     # sad that we do not have a mass remove
     for entry in removed:
       oldval.remove(entry)
     oldval.update(added)
   return (len(added),)
示例#6
0
 def _update(self, documentId, val, oldval, threshold):
     """Apply the change from ``oldval`` to ``val`` for ``documentId``.

     New values are indexed, dropped values are unindexed, and ``oldval``
     is mutated in place so that it ends up holding the values of ``val``.

     Returns a one-element tuple with the number of values added.
     """
     gained = difference(val, oldval)
     lost = difference(oldval, val)
     if gained:
         self._indexValue(documentId, gained, threshold)
     if lost:
         self._unindexValue(documentId, lost)

     # optimize transaction size by not writing _unindex bucket
     few_removals = len(lost) < 100
     if not few_removals:
         oldval.clear()
         oldval.update(val)
     else:
         # sad that we do not have a mass remove
         for item in lost:
             oldval.remove(item)
         oldval.update(gained)
     return (len(gained),)
示例#7
0
    def query_index(self, record, resultset=None):
        """Evaluate ``record`` against this index using its first key.

        The loop returns during its first iteration, so only the first key
        of ``record.keys`` is ever considered.  An empty key sequence yields
        an empty ``IISet``.
        """
        index = self._index
        indexed = self._index_value

        for key in record.keys:
            if bool(key) is not bool(indexed):
                # The key asks for the opposite of the indexed value:
                # remove the indexed documents from resultset or _unindex.
                if resultset is None:
                    return union(difference(self._unindex, index), IISet([]))
                return difference(resultset, index)
            # The key matches the indexed value: check the index itself.
            return intersection(index, resultset)
        return IISet()
    def query_index(self, record, resultset=None):
        """Return the documents selected by the first key of ``record``.

        When the truth value of the key equals that of ``_index_value`` the
        index is intersected with ``resultset``.  Otherwise the index is
        subtracted from ``resultset`` or, when no ``resultset`` is given,
        from ``_unindex``.  Without any key an empty ``IISet`` is returned.
        """
        index = self._index
        indexed = self._index_value

        for key in record.keys:
            matches_indexed = bool(key) is bool(indexed)
            if matches_indexed:
                # If we match the indexed value, check index
                return intersection(index, resultset)
            # Otherwise, remove from resultset or _unindex
            if resultset is not None:
                return difference(resultset, index)
            return union(difference(self._unindex, index), IISet([]))
        return IISet()
        def query_index(self, record, resultset=None):
            """Search every key of ``record`` and combine the row sets.

            The per-key results of ``self.search`` are combined with
            ``union`` when the operator is "or" and with ``intersection``
            otherwise.  With ``exclude_root`` set, the entry found in
            ``_index_items`` for the key is subtracted from its rows first.
            An empty ``IISet`` is returned when the combined result is
            empty or missing.
            """
            level = record.get("level", 0)
            operator = record.get('operator', self.useOperator).lower()
            depth = getattr(record, 'depth', -1)  # use getattr to get 0 value
            navtree = record.get('navtree', 0)
            navtree_start = record.get('navtree_start', 0)
            exclude_root = record.get('exclude_root', 0)

            # "or" selects union; every other operator selects intersection
            combine = {"or": union}.get(operator, intersection)

            combined = None
            for key in record.keys:
                rows = self.search(key, level, depth, navtree, navtree_start,
                                   resultset=resultset)
                if exclude_root:
                    rows = difference(rows, self._index_items.get(key))
                combined = combine(combined, rows)

            return combined or IISet()
示例#10
0
文件: debug.py 项目: eea/eea.versions
 def missing_entries_for_index(self, catalog, index_name):
     """Compare the catalog's path ids with the ids an index references.

     Returns a pair: the ids in ``catalog._catalog.paths`` that the index
     named ``index_name`` does not reference, and ``len(catalog)`` minus
     the number of referenced ids.
     """
     inner = catalog._catalog
     referenced = IISet(inner.getIndex(index_name).referencedObjects())
     unreferenced = difference(IISet(inner.paths), referenced)
     size_gap = len(catalog) - len(referenced)
     return (unreferenced, size_gap)
示例#11
0
文件: ranking.py 项目: a25kk/stv2
 def generate(seq, vqs, mv):
   """Generate ``(value, subsequence)`` pairs for *seq*.

   *vqs* is a sequence of ``(value, query)`` pairs; it is copied before use
   so the caller's list is left untouched.  The last pair is popped, *seq*
   is split into the part matching its query ("feed1", whose values get
   ``v`` added) and the remainder ("feed2"), and both parts are processed
   recursively with the remaining pairs.  The two feeds are then merged,
   always yielding the pair with the larger value first; pairs with equal
   values are combined into one.

   NOTE(review): ``cat`` is not defined in this block -- presumably it is
   supplied by the enclosing scope.  ``mv`` looks like the largest value the
   remaining queries can still contribute (it is reduced by ``v`` here);
   confirm against the caller.
   """
   # no queries left: everything remaining has value 0
   if not vqs: yield 0, seq; return
   vqs = vqs[:] # avoid side effects
   v,q = vqs.pop(); mv -= v
   # restrict the query to the documents of "seq"
   q = And(LiteralResultSet(seq), q)
   qr = _eval(q, cat)
   if qr:
     # "feed1" handles the documents matching "q"; they are removed from "seq"
     feed1 = generate(qr, vqs, mv)
     seq = difference(seq, qr)
   else: feed1 = None
   # "feed2" handles the documents not matching "q" (if there are any)
   feed2 = seq and generate(seq, vqs, mv) or None
   # next pair from "feed1" with "v" added to its value, or "None" at its end
   def fetch1():
     if feed1 is None: return None
     try: val, subseq = feed1.next(); return val + v, subseq
     except StopIteration: return None
   # next pair from "feed2" unchanged, or "None" at its end
   def fetch2():
     if feed2 is None: return None
     try: return feed2.next()
     except StopIteration: return None
   g1 = fetch1()
   # largest value from "feed1" only
   while g1 is not None and g1[0] > mv: yield g1; g1 = fetch1()
   # merge largest values from "feed1" and "feed2"
   g2 = fetch2()
   while g1 is not None and g2 is not None:
     v1 = g1[0]; v2 = g2[0]
     if v1 > v2: yield g1; g1 = fetch1()
     elif v2 > v1: yield g2; g2 = fetch2()
     # Note: per the original author, g1[1] was copied (by the "intersection"
     #  performed during query evaluation); therefore, we can destructively
     #  change it
     else: g1[1].update(g2[1]); yield g1; g1 = fetch1(); g2 = fetch2()
   # handle feed1
   while g1 is not None: yield g1; g1 = fetch1()
   # handle feed2
   while g2 is not None: yield g2; g2 = fetch2()
        def query_index(self, record, resultset=None):
            """Run ``self.search`` for each key of ``record`` and merge rows.

            Rows are merged with ``union`` for the "or" operator and with
            ``intersection`` for any other operator.  If ``exclude_root`` is
            set, the ``_index_items`` entry of the key is removed from the
            rows before merging.  Returns the merged rows, or an empty
            ``IISet`` when they are empty or there were no keys.
            """
            level = record.get("level", 0)
            operator = record.get('operator', self.useOperator).lower()
            # use getattr to get 0 value
            depth = getattr(record, 'depth', -1)
            navtree = record.get('navtree', 0)
            navtree_start = record.get('navtree_start', 0)
            exclude_root = record.get('exclude_root', 0)

            # depending on the operator we use intersection or union
            merge = intersection
            if operator == "or":
                merge = union

            merged = None
            for path in record.keys:
                rows = self.search(
                    path, level, depth, navtree, navtree_start,
                    resultset=resultset)
                if exclude_root:
                    root_rows = self._index_items.get(path)
                    rows = difference(rows, root_rows)
                merged = merge(merged, rows)

            if not merged:
                return IISet()
            return merged
示例#13
0
    def group(self, seq):
        """Generate groups of the documents in ``seq`` ordered by sort index.

        When ``seq`` holds at least a tenth as many entries as the sort
        index, the index entries are walked (reversed if ``_sortReverse``)
        and intersected with ``seq``.  Otherwise each document's key is
        looked up and the documents are bucketed by key.  Documents not
        covered by the index are yielded last under the key ``None``.
        """
        index = self._sortIndex
        reverse = self._sortReverse
        seq_size = len(seq)
        index_size = len(index)
        if seq_size >= 0.1 * index_size:
            # result large compared to index -- sort via index
            seen = IISet()
            seen_count = 0
            load = getattr(index, '_load', None)
            if load is None:
                # not an optimized index: walk its (key, value) items
                # NOTE(review): in this branch the first element yielded
                # below is the whole (key, value) item rather than the key
                # -- confirm this is intended.
                entries = index.items()
                load = lambda pair: pair[1]
                if reverse:
                    entries.reverse()
            elif not reverse:
                entries = index._index.keys()
            else:
                reverse_order = getattr(index, 'getReverseOrder', None)
                entries = reverse_order and reverse_order()
                if entries is None:
                    entries = list(index._index.keys())
                    entries.reverse()
            for entry in entries:
                matched = intersection(seq, load(entry))
                if not matched:
                    continue
                seen.update(matched)
                seen_count += len(matched)
                yield entry, matched
            if seen_count != len(seq):
                yield None, difference(seq, seen)
        else:
            # result relatively small -- sort via result
            buckets = OOBTree()
            key_for = getattr(index, 'keyForDocument', None)
            # work around "nogopip" bug: it defines "keyForDocument" as an
            # integer
            if not callable(key_for):
                # this will fail, when the index neither defines a reasonable
                # "keyForDocument" nor "documentToKeyMap". In this case,
                # the index cannot be used for sorting.
                doc_to_key = index.documentToKeyMap()
                key_for = lambda doc: doc_to_key[doc]
            keyless = IITreeSet()
            for doc in seq.keys():
                try:
                    key = key_for(doc)
                except KeyError:
                    keyless.insert(doc)
                else:
                    docs = buckets.get(key)
                    if docs is None:
                        docs = buckets[key] = IITreeSet()
                    docs.insert(doc)
            grouped = buckets.items()
            if reverse:
                grouped = list(grouped)
                grouped.reverse()
            for pair in grouped:
                yield pair
            if keyless:
                yield None, keyless
示例#14
0
 def missing_entries_for_index(self, catalog, index_name):
     """Report which catalog ids the index ``index_name`` does not reference.

     Returns ``(ids, delta)``: ``ids`` is the set of ids found in
     ``catalog._catalog.paths`` but not among the index's referenced
     objects; ``delta`` is ``len(catalog) - len(referenced)``.
     """
     index = catalog._catalog.getIndex(index_name)
     referenced = IISet(index.referencedObjects())
     catalog_ids = IISet(catalog._catalog.paths)
     missing = difference(catalog_ids, referenced)
     return (missing, len(catalog) - len(referenced))
示例#15
0
文件: ranking.py 项目: a25kk/stv2
 def _group(self, seq):
   """Generate ``(value, documents)`` groups of ``seq``.

   The ``(value, query)`` pairs of the spec are visited from last to
   first.  Documents matching a query are yielded with its value and
   removed from ``seq``.  Whatever is left after all queries is yielded
   with value 0; generation stops early once ``seq`` is empty.
   """
   spec = self._spec
   cat = self._cat
   vqs = spec._getValueQuerySequence()
   for value, query in reversed(vqs):
     matched = _eval(And(LiteralResultSet(seq), query), cat)
     if matched:
       yield value, matched
       seq = difference(seq, matched)
     if not seq:
       return
   yield 0, seq
示例#16
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to ``request``; only the first key is evaluated.

        Returns ``None`` when the parsed request has no keys for this index,
        otherwise a ``(result, (self.id,))`` pair.  A true key intersects
        the index with ``resultset``; a false key subtracts the index from
        ``resultset`` or, without a ``resultset``, from ``_unindex``.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index

        for key in record.keys:
            if key:
                # If True, check index
                hits = intersection(index, resultset)
            elif resultset is None:
                # Otherwise, remove from _unindex ...
                hits = union(difference(self._unindex, index), IISet([]))
            else:
                # ... or from the given resultset
                hits = difference(resultset, index)
            return (hits, (self.id, ))
        return (IISet(), (self.id, ))
示例#17
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to ``request``, looking at the first key only.

        Returns ``None`` if the parsed request carries no keys for this
        index.  Otherwise returns ``(result, (self.id,))``: when the truth
        value of the key equals that of ``_index_value`` the index is
        intersected with ``resultset``; if not, the index is subtracted from
        ``resultset`` (or from ``_unindex`` when no ``resultset`` is given).
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        indexed = self._index_value

        for key in record.keys:
            if bool(key) is not bool(indexed):
                # Otherwise, remove from resultset or _unindex
                if resultset is not None:
                    remaining = difference(resultset, index)
                else:
                    remaining = union(difference(self._unindex, index),
                                      IISet([]))
                return (remaining, (self.id,))
            # If we match the indexed value, check index
            return (intersection(index, resultset), (self.id,))
        return (IISet(), (self.id,))
示例#18
0
    def query_index(self, record, resultset=None):
        """Answer a query for the first key of ``record``.

        A request cache, when one is available, is consulted first and is
        filled after every computation.  Without ``resultset`` the union of
        the selected document sets is returned.  With ``resultset`` the
        inverse selection is computed (and cached) and subtracted from
        ``resultset``.
        """
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record, resultset)
            hit = cache.get(cachekey, None)
            if hit is not None:
                if resultset is None:
                    return hit
                return difference(resultset, hit)

        term = self._convertDateTime(record.keys[0])

        if resultset is not None:
            # Compute the inverse and subtract from res
            only_until = multiunion(self._until_only.values(None, term - 1))
            only_since = multiunion(self._since_only.values(term + 1))
            both_until = multiunion(self._until.values(None, term - 1))
            both_since = multiunion(self._since.values(term + 1))

            inverse = multiunion(
                [both_since, only_since, only_until, both_until])
            if cache is not None:
                cache[cachekey] = inverse
            return difference(resultset, inverse)

        # Aggregate sets for each bucket separately, to avoid
        # large-small union penalties.
        only_until = multiunion(self._until_only.values(term))
        only_since = multiunion(self._since_only.values(None, term))
        both_until = multiunion(self._until.values(term))
        both_since = multiunion(self._since.values(None, term))
        bounded = intersection(both_until, both_since)

        # Merge from smallest to largest.
        selected = multiunion(
            [bounded, only_until, only_since, self._always])
        if cache is not None:
            cache[cachekey] = selected
        return selected
    def query_index(self, record, resultset=None):
        """Evaluate the first key of ``record`` against the range buckets.

        If a request cache exists, a cached value is used when present and
        every freshly computed value is stored in it.  With no
        ``resultset`` the computed union is returned directly; with a
        ``resultset`` the computed inverse is subtracted from it.
        """
        cache = self.getRequestCache()
        have_cache = cache is not None
        if have_cache:
            cachekey = self.getRequestCacheKey(record, resultset)
            cached = cache.get(cachekey, None)
            if cached is not None:
                if resultset is not None:
                    return difference(resultset, cached)
                return cached

        term = self._convertDateTime(record.keys[0])
        if resultset is None:
            # Aggregate sets for each bucket separately, to avoid
            # large-small union penalties.
            u_only = multiunion(self._until_only.values(term))
            s_only = multiunion(self._since_only.values(None, term))
            u_both = multiunion(self._until.values(term))
            s_both = multiunion(self._since.values(None, term))
            bounded = intersection(u_both, s_both)

            # Merge from smallest to largest.
            result = multiunion([bounded, u_only, s_only, self._always])
            if have_cache:
                cache[cachekey] = result
            return result

        # Compute the inverse and subtract from res
        u_only = multiunion(self._until_only.values(None, term - 1))
        s_only = multiunion(self._since_only.values(term + 1))
        u_both = multiunion(self._until.values(None, term - 1))
        s_both = multiunion(self._since.values(term + 1))

        result = multiunion([s_both, s_only, u_only, u_both])
        if have_cache:
            cache[cachekey] = result
        return difference(resultset, result)
示例#20
0
 def _eval(self, context):
   """Evaluate the classified subqueries as a conjunction.

   The results of the 'lookup', 'complex' and 'indexed' subqueries are
   intersected, then the results of the queries wrapped by the 'notQ'
   subqueries are subtracted.  If only 'notQ' subqueries exist, one of
   them is moved to the intersected group and evaluated directly.
   """
   classified = self._classifySubqueries()
   if classified['empty']:
     return IISet() # empty result
   positives = (
     classified['lookup'] + classified['complex'] + classified['indexed'])
   negatives = classified['notQ']
   if not (positives or negatives):
     # an empty 'And' query
     return context._getObjectIds()
   if not positives:
     positives.append(negatives.pop())
   result = None
   for sub in positives:
     result = intersection(result, sub._eval(context))
   for sub in negatives:
     result = difference(result, sub._query._eval(context))
   return result
示例#21
0
 def _group(self, seq):
     """Split ``seq`` into ``(value, documents)`` groups.

     Walks the spec's ``(value, query)`` pairs backwards.  Each query is
     evaluated restricted to the documents still in ``seq``; a non-empty
     result is yielded with the pair's value and taken out of ``seq``.
     Stops as soon as ``seq`` is empty; otherwise the leftover documents
     are yielded with value 0.
     """
     spec = self._spec
     cat = self._cat
     pairs = spec._getValueQuerySequence()
     for val, subquery in reversed(pairs):
         restricted = And(LiteralResultSet(seq), subquery)
         hits = _eval(restricted, cat)
         if hits:
             yield val, hits
             seq = difference(seq, hits)
         if not seq:
             return
     yield 0, seq
    def timing(self, small, large):
        """Time ``difference`` against ``difference2`` and ``ciidifference``.

        Each implementation is called ten times with ``(small, large)`` and
        the accumulated wall-clock time is printed per implementation.
        ``ciidifference`` is timed only when it is not ``None``.
        """
        new = 0.0
        old = 0.0
        c = 0.0
        loop = LOOP
        # NOTE(review): the labels below report ``LOOP`` while the loop runs
        # a fixed ten iterations -- confirm which one is intended.
        for i in xrange(10):
            start = time()
            difference(small, large)
            old += (time() - start)

            start = time()
            difference2(small, large)
            new += (time() - start)

            if ciidifference is not None:
                start = time()
                ciidifference(small, large)
                c += (time() - start)

        # Parenthesised single-argument prints behave the same under the
        # Python 2 print statement and the Python 3 print function.
        print('Old x%s: %.6f' % (loop, old))
        print('New x%s: %.6f' % (loop, new))
        if ciidifference is not None:
            # A coarse timer can report 0.0 for ten fast calls; avoid a
            # ZeroDivisionError and report an infinite factor instead.
            factor = old / c if c else float('inf')
            print('Cyt x%s: %.6f - factor: %.2f' % (loop, c, factor))
示例#23
0
    def timing(self, small, large):
        """Print accumulated timings of the set-difference implementations.

        ``difference``, ``difference2`` and (when it is not ``None``)
        ``ciidifference`` are each called ten times with ``(small, large)``
        and the summed wall-clock time of each is printed.
        """
        new = 0.0
        old = 0.0
        c = 0.0
        loop = LOOP
        # NOTE(review): ``LOOP`` is only used in the printed labels; the
        # number of iterations is fixed at ten -- confirm this is intended.
        for i in xrange(10):
            start = time()
            difference(small, large)
            old += (time() - start)

            start = time()
            difference2(small, large)
            new += (time() - start)

            if ciidifference is not None:
                start = time()
                ciidifference(small, large)
                c += (time() - start)

        # Single parenthesised arguments keep the output identical under
        # both the Python 2 print statement and the Python 3 function.
        print('Old x%s: %.6f' % (loop, old))
        print('New x%s: %.6f' % (loop, new))
        if ciidifference is not None:
            # Guard against a zero elapsed time (low-resolution timers),
            # which previously raised ZeroDivisionError.
            factor = old / c if c else float('inf')
            print('Cyt x%s: %.6f - factor: %.2f' % (loop, c, factor))
示例#24
0
    def index_object(self, document_id, obj, threshold=None):
        """Index an object.

        'document_id' is the integer ID of the document.
        'obj' is the object to be indexed.
        'threshold' is accepted but not used by this implementation.

        The ranges obtained from 'obj' are compared with the entries
        currently stored for the document; expired entries are removed and
        new ones inserted.  Returns 0 when nothing changed, 1 otherwise.
        """
        ranges = self._get_object_data(obj, self.id)
        current = IISet()
        if ranges:
            current = IISet([self.__index_range(r) for r in ranges])

        previous = self._unindex.get(document_id, IISet())

        added = difference(current, previous)
        expired = difference(previous, current)

        if not added and not expired:
            # nothing to do, bail out !
            return 0

        for entry in expired:
            self.__remove_in_index_set(self._unindex, document_id, entry)
            emptied = self.__remove_in_index_set(
                self._index, entry, document_id)
            if emptied:
                # range is not used anymore, retire it
                self.__unindex_range(entry)

        for entry in added:
            grew = self.__insert_in_index_set(
                self._unindex, document_id, entry)
            if grew:
                self._length.change(1)
            self.__insert_in_index_set(self._index, entry, document_id)

        return 1
示例#25
0
 def _eval(self, context):
     """Evaluate this 'And' query within ``context``.

     Returns an empty ``IISet`` when the classification flags the query as
     empty, and ``context._getObjectIds()`` when there are no subqueries at
     all.  Otherwise the positive subquery results are intersected and the
     results of the negated queries are subtracted from that.
     """
     groups = self._classifySubqueries()
     if groups['empty']:
         return IISet()  # empty result
     wanted = groups['lookup'] + groups['complex'] + groups['indexed']
     unwanted = groups['notQ']
     if not wanted:
         if not unwanted:
             # an empty 'And' query
             return context._getObjectIds()
         # only negated subqueries: evaluate one of them directly
         wanted.append(unwanted.pop())
     acc = None
     for query in wanted:
         acc = intersection(acc, query._eval(context))
     for query in unwanted:
         acc = difference(acc, query._query._eval(context))
     return acc
示例#26
0
    def below(self, arg):
        """Find all resources at or below path, within the limits given.

        Returns ``None`` when the path is not present in ``path2rid``.
        """
        # Parse and validate.
        path, upper, lower = self._path_and_limits(arg)
        if self.path2rid.get(path, None) is None:
            return None

        # Build: intersect the rid sets of every (level, part) of the path.
        parts = path.split(os.sep)
        rids = None
        for level, part in enumerate(parts):
            rids = intersection(rids, self.parts[(level, part)])
        if rids is None:
            return IISet() # short-cut

        # Limits
        # ------
        # Remove rids that are above any upper limit, and then only include
        # rids that are above any lower limit. Limits are relative to the
        # level of the requested path (``level`` is the last path index).
        if upper is not None:
            for depth in range(level, level + upper):
                if depth not in self.levels:
                    break
                rids = difference(rids, self.levels[depth])
        if lower is not None:
            level_sets = []
            for depth in range(level, level + lower):
                if depth not in self.levels:
                    break
                level_sets.append(self.levels[depth])
            rids = intersection(rids, multiunion(level_sets))

        return rids
示例#27
0
文件: sorting.py 项目: Vinsurya/Plone
 def group(self, seq):
   """Generate groups of ``seq`` in the order given by the sort index.

   A large ``seq`` (at least a tenth of the index size) is grouped by
   walking the index and intersecting each entry with ``seq``; a small one
   is grouped by looking up the key of each document.  Documents without
   an index entry come last, under the key ``None``.
   """
   sort_index = self._sortIndex
   descending = self._sortReverse
   n_seq = len(seq)
   n_index = len(sort_index)
   if n_seq >= 0.1 * n_index:
     # result large compared to index -- sort via index
     covered = IISet()
     n_covered = 0
     loader = getattr(sort_index, '_load', None)
     if loader is None:
       # not an optimized index
       # NOTE(review): here the yielded first element is the whole
       # (key, value) item, not just the key -- confirm this is intended.
       walk = sort_index.items()
       loader = lambda item: item[1]
       if descending:
         walk.reverse()
     elif descending:
       get_reversed = getattr(sort_index, 'getReverseOrder', None)
       walk = get_reversed and get_reversed()
       if walk is None:
         walk = list(sort_index._index.keys())
         walk.reverse()
     else:
       walk = sort_index._index.keys()
     for step in walk:
       found = intersection(seq, loader(step))
       if found:
         covered.update(found)
         n_covered += len(found)
         yield step, found
     if n_covered != len(seq):
       yield None, difference(seq, covered)
   else:
     # result relatively small -- sort via result
     by_key = OOBTree()
     lookup = getattr(sort_index, 'keyForDocument', None)
     # work around "nogopip" bug: it defines "keyForDocument" as an integer
     if not callable(lookup):
       # this will fail, when the index neither defines a reasonable
       # "keyForDocument" nor "documentToKeyMap". In this case,
       # the index cannot be used for sorting.
       key_map = sort_index.documentToKeyMap()
       lookup = lambda doc: key_map[doc]
     unkeyed = IITreeSet()
     for doc in seq.keys():
       try:
         key = lookup(doc)
       except KeyError:
         unkeyed.insert(doc)
         continue
       members = by_key.get(key)
       if members is None:
         members = by_key[key] = IITreeSet()
       members.insert(doc)
     ordered = by_key.items()
     if descending:
       ordered = list(ordered)
       ordered.reverse()
     for group_pair in ordered:
       yield group_pair
     if unkeyed:
       yield None, unkeyed
示例#28
0
        def testQuery(record, expect=1):
            """Run ``record`` twice and verify the request cache counters.

            After the first query the cache must report ``expect`` sets and
            misses; after the second it must report ``expect`` hits.
            """
            request_cache = idx.getRequestCache()
            request_cache.clear()

            # First query: must be a cache miss that stores its result
            first = idx._apply_index(record)
            self.assertEqual(request_cache._sets, expect)
            self.assertEqual(request_cache._misses, expect)

            # Second query: must be answered from the cache
            second = idx._apply_index(record)
            self.assertEqual(request_cache._hits, expect)

            # Check if result of second query is equal to first query
            # NOTE(review): only ``first - second`` is checked, so extra
            # entries in the second result would go unnoticed.
            leftover = difference(first[0], second[0])
            self.assertEqual(len(leftover), 0)
示例#29
0
文件: sorting.py 项目: a25kk/stv2
 def group(self, seq):
   """Generate groups of ``seq`` in the order given by the sort index.

   When ``seq`` holds at least a tenth as many entries as the sort index,
   the index entries are walked (reversed if ``_sortReverse``) and
   intersected with ``seq``.  Otherwise the key of each document is looked
   up and the documents are bucketed by key.  Documents the index does not
   cover are yielded last under the key ``None``.

   The key lookup uses ``keyForDocument`` when the index provides it as a
   callable and falls back to ``documentToKeyMap()`` otherwise.
   """
   sortIndex = self._sortIndex
   sortReverse = self._sortReverse
   ns = len(seq)
   ni = len(sortIndex)
   if ns >= 0.1 * ni:
     # result large compared to index -- sort via index
     handled = IISet()
     hn = 0
     _load = getattr(sortIndex, '_load', None)
     if _load is None:
       # not an optimized index
       # NOTE(review): here the yielded first element is the whole
       # (key, value) item, not just the key -- confirm this is intended.
       items = sortIndex.items()
       # index-based accessor instead of the Python-2-only tuple parameter
       _load = lambda item: item[1]
       if sortReverse:
         items.reverse()
     elif sortReverse:
       gRO = getattr(sortIndex, 'getReverseOrder', None)
       items = gRO and gRO()
       if items is None:
         items = list(sortIndex._index.keys())
         items.reverse()
     else:
       items = sortIndex._index.keys()
     for i in items:
       ids = intersection(seq, _load(i))
       if ids:
         handled.update(ids)
         hn += len(ids)
         yield i, ids
     if hn != len(seq):
       yield None, difference(seq, handled)
   else:
     # result relatively small -- sort via result
     m = OOBTree()
     keyFor = getattr(sortIndex, 'keyForDocument', None)
     # Some indexes (e.g. "nogopip") define "keyForDocument" as a
     # non-callable; fall back to the document-to-key map then.
     if not callable(keyFor):
       # this will fail, when the index neither defines a reasonable
       # "keyForDocument" nor "documentToKeyMap". In this case,
       # the index cannot be used for sorting.
       keyMap = sortIndex.documentToKeyMap()
       keyFor = lambda doc: keyMap[doc]
     noValue = IITreeSet()
     for doc in seq.keys():
       try:
         k = keyFor(doc)
       except KeyError:
         noValue.insert(doc)
         continue
       l = m.get(k)
       if l is None:
         l = m[k] = IITreeSet()
       l.insert(doc)
     items = m.items()
     if sortReverse:
       items = list(items)
       items.reverse()
     for i in items:
       yield i
     if noValue:
       yield None, noValue
示例#30
0
 def executeQuery(self, index):
     """Execute the child nodes and combine their results.

     Results of ordinary children are intersected; results of the nodes
     wrapped by "NOT" children are united and subtracted from that.
     Children whose query returns ``None`` are left out.
     """
     positives = []
     negatives = []
     for subnode in self.getValue():
         if subnode.nodeType() == "NOT":
             # If None, technically it matches every doc, but we treat
             # it as if it matched none (we want
             #     real_word AND NOT stop_word
             # to act like plain real_word).
             bucket, node = negatives, subnode.getValue()
         else:
             # If None, technically it matches every doc, so needn't be
             # included.
             bucket, node = positives, subnode
         matches = node.executeQuery(index)
         if matches is not None:
             bucket.append((matches, 1))
     result = mass_weightedIntersection(positives)
     if negatives:
         result = difference(result, mass_weightedUnion(negatives))
     return result
示例#31
0
 def executeQuery(self, index):
     """Evaluate all subnodes against ``index`` and merge the outcomes.

     Subnodes of type "NOT" contribute the result of their wrapped node to
     an exclusion list; all others contribute to an inclusion list.  The
     weighted intersection of the inclusions, minus the weighted union of
     the exclusions (if any), is returned.
     """
     included = []
     excluded = []
     for child in self.getValue():
         is_not = child.nodeType() == "NOT"
         if is_not:
             outcome = child.getValue().executeQuery(index)
         else:
             outcome = child.executeQuery(index)
         if outcome is None:
             # A None result technically matches every doc.  For a plain
             # child it therefore needn't be included; for a NOT child we
             # treat it as if it matched none (we want
             #     real_word AND NOT stop_word
             # to act like plain real_word).
             continue
         if is_not:
             excluded.append((outcome, 1))
         else:
             included.append((outcome, 1))
     merged = mass_weightedIntersection(included)
     if not excluded:
         return merged
     return difference(merged, mass_weightedUnion(excluded))
示例#32
0
    def insertDocument(self, docid, widlist):
        """Store ``widlist`` as the word ids of document ``docid``.

        The encoded word-id list is written to ``_doc2wid`` when it differs
        from the stored one, ``docid`` is added to the ``_wid2doc`` entry of
        every word id in ``widlist`` and removed from the entries of word
        ids that were dropped, and the number of word ids is recorded in
        ``_docweight``.  ``_length`` is incremented for unknown documents.
        """
        if not self._doc2wid.has_key(docid):
            self._length.change(1)

        encoded = encode(widlist)
        stored = self._doc2wid.get(docid)
        previous = None
        if stored is not None:
            previous = stored.get() # unwrap _PS instance

        dropped = []
        if previous != encoded:
            self._doc2wid[docid] = _PS(encoded)
            if previous is not None:
                dropped = difference(IISet(decode(previous)), IISet(widlist))

        tree = self._wid2doc
        tree_has = tree.has_key
        count = 0
        for wid in widlist:
            count += 1
            if not tree_has(wid):
                tree[wid] = DocidList([docid])
                continue
            docids = tree[wid]
            if docid not in docids:
                docids.insert(docid)

        for wid in dropped:
            if not tree_has(wid):
                continue
            try:
                tree[wid].remove(docid)
            except KeyError:
                pass

        self._docweight[docid] = count
示例#33
0
    def insertDocument(self, docid, widlist):
        """Register ``docid`` with the word ids in ``widlist``.

        Increments ``_length`` for a document not yet in ``_doc2wid``,
        replaces the stored encoded word-id list if it changed, adds the
        document to the ``_wid2doc`` entry of each word id, removes it from
        the entries of word ids no longer present, and stores the number of
        word ids in ``_docweight``.
        """
        is_new_document = not self._doc2wid.has_key(docid)
        if is_new_document:
            self._length.change(1)

        new_encoding = encode(widlist)
        old_encoding = self._doc2wid.get(docid)
        if old_encoding is not None:
            old_encoding = old_encoding.get()  # unwrap _PS instance

        stale_wids = []
        if old_encoding != new_encoding:
            self._doc2wid[docid] = _PS(new_encoding)
            if old_encoding is not None:
                old_wids = IISet(decode(old_encoding))
                stale_wids = difference(old_wids, IISet(widlist))

        wid2doc = self._wid2doc
        knows = wid2doc.has_key
        total = 0
        for wid in widlist:
            total += 1
            if knows(wid):
                if docid not in wid2doc[wid]:
                    wid2doc[wid].insert(docid)
            else:
                wid2doc[wid] = DocidList([docid])

        for wid in stale_wids:
            if knows(wid):
                try:
                    wid2doc[wid].remove(docid)
                except KeyError:
                    # the document was not listed for this word id
                    pass

        self._docweight[docid] = total
示例#34
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the request arg.

        The request argument should be a mapping object.

        If the request does not have a key which matches the "id" of
        the index instance, then None is returned.

        If the request *does* have a key which matches the "id" of
        the index instance, one of a few things can happen:

          - if the value is a blank string, None is returned (in
            order to support requests from web forms where
            you can't tell a blank string from empty).

          - if the value is a nonblank string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

          - If the value is a dict and contains a key of the form
            '<index>_operator' this overrides the default method
            ('or') to combine search results. Valid values are "or"
            and "and".

        If None is not returned as a result of the abovementioned
        constraints, two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.

        'resultset', when given, bounds the result of 'and' searches and
        is handed to self._apply_not() for 'not' handling.

        Raises RuntimeError if the requested operator is not listed in
        self.operators, and TypeError if a (non-range) query key is None.

        FAQ answer:  to search a Field Index for documents that
        have a blank string as their value, wrap the request value
        up in a tuple ala: request = {'id':('',)}
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)
        if not record.keys and not_parm:
            # Convert into indexed format.  The result is materialized
            # because it is probed once per index key below; on Python 3
            # a bare map() would be exhausted after the first probe.
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # Convert query arguments into indexed format.  A list is
            # needed because range searches call both min() and max()
            # on the keys.
            record.keys = list(map(self._convert, record.keys))

        # experimental code for specifing the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % escape(operator))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if range_parm.find("min") > -1:
                opr_args.append("min")
            if range_parm.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Test against None explicitly: a falsy upper bound such as 0
            # or '' is still a real bound and must not be dropped.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))
                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result, (self.id,)

            # Index values may be bare ints; wrap them so the set
            # operations below see sets only.
            tmp = [IISet((s,)) if isinstance(s, int) else s for s in setlist]
            if operator == 'or':
                r = multiunion(tmp)
            else:
                # For intersection, sort with smallest data set first
                if len(tmp) > 2:
                    tmp = sorted(tmp, key=len)
                r = resultset
                for s in tmp:
                    # the result is bound by the resultset
                    r = intersection(r, s)

        else:  # not a range search
            # Collect the stored set for every requested key
            setlist = []
            for k in record.keys:
                if k is None:
                    raise TypeError('None cannot be in an index.')
                s = index.get(k, None)
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # A missing key contributes nothing to a union
                        continue
                    # A missing key makes any intersection empty
                    return IISet(), (self.id,)
                elif isinstance(s, int):
                    s = IISet((s,))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                # entries were already wrapped into IISets above
                result = setlist[0]
                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result, (self.id,)

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.
                if resultset is not None and len(resultset) < 200:
                    smalllist = [intersection(resultset, s) for s in setlist]
                    r = multiunion(smalllist)
                else:
                    r = multiunion(setlist)
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)
                r = resultset
                for s in setlist:
                    r = intersection(r, s)

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet(), (self.id,)
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r, (self.id,)
示例#35
0
    def index_object(self, documentId, obj, threshold=None):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.

           o Repeat by recurdef - a RFC2445 reccurence definition string

        The date is read from the attribute named by self.id, the
        recurrence rule from self.attr_recurdef and the optional end of
        the recurrence from self.attr_until; callables are called.

        Returns 1 when new values were written for 'documentId',
        otherwise 0 (attribute missing, values unchanged, or no values to
        index).  'threshold' is not used by this method.
        """
        returnStatus = 0

        try:
            date_attr = getattr(obj, self.id)
            if safe_callable(date_attr):
                date_attr = date_attr()
        except AttributeError:
            return returnStatus

        recurdef = getattr(obj, self.attr_recurdef, None)
        if safe_callable(recurdef):
            recurdef = recurdef()

        if not recurdef:
            dates = [pydt(date_attr)]
        else:
            until = getattr(obj, self.attr_until, None)
            if safe_callable(until):
                until = until()

            dates = recurrence_sequence_ical(date_attr,
                                             recrule=recurdef,
                                             until=until)

        newvalues = IISet(map(dt2int, dates))
        oldvalues = self._unindex.get(documentId, _marker)

        if oldvalues is not _marker:
            oldvalues = IISet(oldvalues)

            # difference is calculated relative to first argument, so we
            # have to use it twice to detect "no change in either direction"
            if not difference(newvalues, oldvalues)\
                    and not difference(oldvalues, newvalues):
                return returnStatus

            for oldvalue in oldvalues:
                self.removeForwardIndexEntry(oldvalue, documentId)

            # newvalues is always an IISet here, so emptiness (not identity
            # with _marker) is what signals "nothing left to index".  The
            # reverse entry must go too; otherwise a later re-index with
            # the old values would look unchanged and skip re-inserting
            # the forward entries removed above.
            if not newvalues:
                try:
                    del self._unindex[documentId]
                except ConflictError:
                    raise
                except Exception:
                    LOG.error("Should not happen: oldvalues was there,"
                              " now it's not, for document with id %s" %
                              documentId)

        if newvalues:
            for value in newvalues:
                self.insertForwardIndexEntry(value, documentId)
            # store tuple values in reverse index entries for sorting
            self._unindex[documentId] = tuple(newvalues)
            returnStatus = 1

        return returnStatus
    def index_object(self, documentId, obj, threshold=None):
        """Index the text found on 'obj' under 'documentId'.

        The text is taken from the attribute of 'obj' named by self.id
        (called first if callable) and coerced with str() unless it is
        already a UnicodeType.  The encoding handed to the splitter comes
        from the attribute '<id>_encoding' and falls back to 'latin1'.

        Returns the number of distinct word ids now recorded for the
        document, or 0 when the source attribute cannot be read.

        'threshold' is accepted but not referenced in this method body.
        """
        # Fetch the document source; give up quietly if it is not there.
        try:
            text = getattr(obj, self.id)
            if safe_callable(text):
                text = text()
            if not isinstance(text, UnicodeType):
                text = str(text)
        except (AttributeError, TypeError):
            return 0

        # Optional per-object encoding hint.
        try:
            enc = getattr(obj, self.id + '_encoding')
            if safe_callable(enc):
                enc = enc()
            enc = str(enc)
        except (AttributeError, TypeError):
            enc = 'latin1'

        lexicon = self.getLexicon()
        splitter = lexicon.Splitter

        # Count word occurrences; a word immediately repeating the
        # previous one is skipped.
        scores_by_word = OIBTree()
        previous = None
        for token in list(splitter(text, encoding=enc)):
            if token[0] == '\"':
                # Quoted token: delegate the unquoted content to _subindex.
                previous = self._subindex(token[1:-1], scores_by_word,
                                          previous, splitter)
                continue
            if token == previous:
                continue
            previous = token
            scores_by_word[token] = scores_by_word.get(token, 0) + 1

        # Re-key the scores by word id.
        scores_by_wid = IIBucket()
        word_to_wid = lexicon.getWordId
        for token, hits in scores_by_word.items():
            scores_by_wid[word_to_wid(token)] = hits

        del scores_by_word

        indexed_wids = IISet(self._unindex.get(documentId, []))

        # Drop word ids recorded earlier that the new text no longer has.
        self.unindex_objectWids(documentId,
                                difference(indexed_wids, scores_by_wid))

        # Write the forward entries for every current word id.
        add_entry = self.insertForwardIndexEntry
        for wid, hits in scores_by_wid.items():
            add_entry(wid, documentId, hits)

        # Only touch the reverse mapping when the word-id set changed.
        new_wids = scores_by_wid.keys()
        if new_wids != indexed_wids.keys():
            self._unindex[documentId] = new_wids

        return len(new_wids)
示例#37
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the request arg.

        The request argument should be a mapping object.

        If the request does not have a key which matches the "id" of
        the index instance, then None is returned.

        If the request *does* have a key which matches the "id" of
        the index instance, one of a few things can happen:

          - if the value is a blank string, None is returned (in
            order to support requests from web forms where
            you can't tell a blank string from empty).

          - if the value is a nonblank string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

          - If the value is a dict and contains a key of the form
            '<index>_operator' this overrides the default method
            ('or') to combine search results. Valid values are "or"
            and "and".

        If None is not returned as a result of the abovementioned
        constraints, two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.

        'resultset', when given, bounds the result of 'and' searches and
        is handed to self._apply_not() for 'not' handling.

        Raises RuntimeError if the requested operator is not listed in
        self.operators.

        FAQ answer:  to search a Field Index for documents that
        have a blank string as their value, wrap the request value
        up in a tuple ala: request = {'id':('',)}
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)
        if not record.keys and not_parm:
            # Convert into indexed format.  The result is materialized
            # because it is probed once per index key below; on Python 3
            # a bare map() would be exhausted after the first probe.
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # Convert query arguments into indexed format.  A list is
            # needed because range searches call both min() and max()
            # on the keys.
            record.keys = list(map(self._convert, record.keys))

        # experimental code for specifing the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % escape(operator))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if range_parm.find("min") > -1:
                opr_args.append("min")
            if range_parm.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Test against None explicitly: a falsy upper bound such as 0
            # or '' is still a real bound and must not be dropped.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result, ))
                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result, (self.id, )

            # Index values may be bare ints; wrap them so the set
            # operations below see sets only.
            tmp = [IISet((s, )) if isinstance(s, int) else s for s in setlist]
            if operator == 'or':
                r = multiunion(tmp)
            else:
                # For intersection, sort with smallest data set first
                if len(tmp) > 2:
                    tmp = sorted(tmp, key=len)
                r = resultset
                for s in tmp:
                    # the result is bound by the resultset
                    r = intersection(r, s)

        else:  # not a range search
            # Collect the stored set for every requested key
            setlist = []
            for k in record.keys:
                s = index.get(k, None)
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # A missing key contributes nothing to a union
                        continue
                    # A missing key makes any intersection empty
                    return IISet(), (self.id, )
                elif isinstance(s, int):
                    s = IISet((s, ))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                # entries were already wrapped into IISets above
                result = setlist[0]
                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result, (self.id, )

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.
                if resultset is not None and len(resultset) < 200:
                    smalllist = [intersection(resultset, s) for s in setlist]
                    r = multiunion(smalllist)
                else:
                    r = multiunion(setlist)
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)
                r = resultset
                for s in setlist:
                    r = intersection(r, s)

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet(), (self.id, )
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r, (self.id, )
示例#38
0
    def query_index(self, record, resultset=None):
        """Search the index with the given IndexQuery object.

        If the query has a key which matches the 'id' of
        the index instance, one of a few things can happen:

          - if the value is a string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

          - If the value is a dict and contains a key of the form
            '<index>_operator' this overrides the default method
            ('or') to combine search results. Valid values are 'or'
            and 'and'.

        'resultset', when given, bounds the result of 'and' searches and
        is handed to self._apply_not() for 'not' handling.

        Results are memoized in the cache returned by
        self.getRequestCache() when both the cache and a cache key are
        available: for 'or' the combined set is stored, for 'and' the
        list of per-key sets is stored (the intersection depends on
        'resultset' and is recomputed on every call).

        Returns the set of matching record ids (an empty IISet when
        nothing matches).
        """
        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)

        operator = record.operator

        cachekey = None
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record)
            if cachekey is not None:
                cached = None
                if operator == 'or':
                    cached = cache.get(cachekey, None)
                else:
                    cached_setlist = cache.get(cachekey, None)
                    if cached_setlist is not None:
                        r = resultset
                        for s in cached_setlist:
                            # the result is bound by the resultset
                            r = intersection(r, s)
                            # Once empty, further intersections cannot
                            # change the result
                            if not r:
                                break
                        cached = r

                if cached is not None:
                    if isinstance(cached, int):
                        cached = IISet((cached, ))

                    if not_parm:
                        not_parm = list(map(self._convert, not_parm))
                        exclude = self._apply_not(not_parm, resultset)
                        cached = difference(cached, exclude)

                    return cached

        if not record.keys and not_parm:
            # convert into indexed format
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # convert query arguments into indexed format
            record.keys = list(map(self._convert, record.keys))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if range_parm.find("min") > -1:
                opr_args.append("min")
            if range_parm.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Test against None explicitly: a falsy upper bound such as 0
            # or '' is still a real bound and must not be dropped.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result, ))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s, ))
                    tmp.append(s)
                r = multiunion(tmp)

                if cachekey is not None:
                    cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s, ))
                    tmp.append(s)
                if len(tmp) > 2:
                    setlist = sorted(tmp, key=len)
                else:
                    setlist = tmp

                # 'r' is not invariant of resultset. Thus, we
                # have to remember 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    # the result is bound by the resultset
                    r = intersection(r, s)
                    # Once empty, further intersections cannot change
                    # the result
                    if not r:
                        break

        else:  # not a range search
            # Collect the stored set for every requested key
            setlist = []
            for k in record.keys:
                if k is None:
                    # Prevent None from being looked up. None doesn't
                    # have a valid ordering definition compared to any
                    # other object. BTrees 4.0+ will throw a TypeError
                    # "object has default comparison".
                    continue
                s = index.get(k, None)
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # A missing key contributes nothing to a union
                        continue
                    # A missing key makes any intersection empty
                    if cachekey is not None:
                        # If operator is 'and', we have to cache a list of
                        # IISet objects
                        cache[cachekey] = [IISet()]
                    return IISet()
                elif isinstance(s, int):
                    s = IISet((s, ))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result, ))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.

                if resultset is not None and len(resultset) < 200:
                    smalllist = []
                    for s in setlist:
                        smalllist.append(intersection(resultset, s))
                    r = multiunion(smalllist)

                    # 'r' is not invariant of resultset.  Thus, we
                    # have to remember the union of 'setlist'. But
                    # this is maybe a performance killer. So we do not cache.
                    # if cachekey is not None:
                    #    cache[cachekey] = multiunion(setlist)

                else:
                    r = multiunion(setlist)
                    if cachekey is not None:
                        cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember 'setlist' itself
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    r = intersection(r, s)
                    # Once empty, further intersections cannot change
                    # the result
                    if not r:
                        break

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet()
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r
示例#39
0
        def generate(seq, vqs, mv):
            """Yield ``(value, subset)`` pairs partitioning ``seq``.

            ``vqs`` is a list of ``(value, query)`` pairs.  The last pair is
            popped; the members of ``seq`` matching its query go to one
            recursive feed (whose values get ``value`` added), the remaining
            members to a second feed, and both feeds are merged so that
            larger values are yielded first.  Pairs with equal value are
            combined into one by updating the first feed's subset in place.

            With an empty ``vqs`` a single ``(0, seq)`` pair is yielded.

            ``mv`` is reduced by each popped value; pairs from the first feed
            whose value exceeds it are yielded before the second feed is
            consulted (presumably ``mv`` is the largest value still
            reachable through the remaining queries -- confirm with caller).
            """
            if not vqs:
                yield 0, seq
                return
            vqs = vqs[:]  # avoid side effects
            v, q = vqs.pop()
            mv -= v
            q = And(LiteralResultSet(seq), q)
            qr = _eval(q, cat)
            if qr:
                feed1 = generate(qr, vqs, mv)
                seq = difference(seq, qr)
            else:
                feed1 = None
            feed2 = generate(seq, vqs, mv) if seq else None

            def fetch1():
                # Next pair from the matching side, value raised by ``v``;
                # None once exhausted.  The builtin next() works on both
                # Python 2.6+ and 3, unlike the ``.next()`` method.
                if feed1 is None:
                    return None
                try:
                    val, subseq = next(feed1)
                except StopIteration:
                    return None
                return val + v, subseq

            def fetch2():
                # Next pair from the non-matching side; None once exhausted.
                if feed2 is None:
                    return None
                return next(feed2, None)

            g1 = fetch1()
            # largest value from "feed1" only
            while g1 is not None and g1[0] > mv:
                yield g1
                g1 = fetch1()
            # merge largest values from "feed1" and "feed2"
            g2 = fetch2()
            while g1 is not None and g2 is not None:
                v1 = g1[0]
                v2 = g2[0]
                if v1 > v2:
                    yield g1
                    g1 = fetch1()
                elif v2 > v1:
                    yield g2
                    g2 = fetch2()
                else:
                    # Note: g1[1] was copied (by the "And" query evaluated
                    #  above); therefore, we can destructively change it
                    g1[1].update(g2[1])
                    yield g1
                    g1 = fetch1()
                    g2 = fetch2()
            # handle feed1
            while g1 is not None:
                yield g1
                g1 = fetch1()
            # handle feed2
            while g2 is not None:
                yield g2
                g2 = fetch2()
示例#40
0
def inverseResultSet(all_docids, set):
    """Return a ResultSet of the docids in 'all_docids' missing from 'set'.

    The docids of 'set' are subtracted from 'all_docids'; the words of
    'set' are carried over unchanged into the new ResultSet.
    """
    universe = DocidList(all_docids)
    excluded = set.getDocids()
    remaining = difference(universe, excluded)
    words = set.getWords()
    return ResultSet(remaining, words)
示例#41
0
 def _eval(self, context):
     """Return the ids of 'context' that the wrapped query does not match.

     The result is the difference between context._getObjectIds() and the
     evaluation of self._query against the same context.
     """
     universe = context._getObjectIds()
     matched = self._query._eval(context)
     return difference(universe, matched)
示例#42
0
def inverseResultSet(all_docids, set):
    """Return a ResultSet holding the docids of *all_docids* not in *set*.

    ``set.getDocids()`` is removed from ``DocidList(all_docids)``; the
    word list comes straight from ``set.getWords()``.
    """
    candidates = DocidList(all_docids)
    to_remove = set.getDocids()
    kept = difference(candidates, to_remove)
    return ResultSet(kept, set.getWords())
示例#43
0
 def difference(self, *args):
     """Forward all positional arguments to ``BTrees.IIBTree.difference``."""
     # The import stays inside the method so it only happens at call time.
     from BTrees import IIBTree as _iibtree
     return _iibtree.difference(*args)
    def index_object(self, documentId, obj, threshold=None):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.

           o Repeat by recurdef - a RFC2445 recurrence definition string

           Returns 1 when at least one value was inserted for
           'documentId', otherwise 0 (attribute missing, values unchanged,
           or nothing to insert).  'threshold' is not used here.

        """
        returnStatus = 0

        # Read the indexed attribute (named after this index's id); call it
        # if it is callable.  A missing attribute means "nothing to index".
        try:
            date_attr = getattr(obj, self.id)
            if safe_callable(date_attr):
                date_attr = date_attr()
        except AttributeError:
            return returnStatus

        # Optional recurrence definition, read the same way.
        recurdef = getattr(obj, self.attr_recurdef, None)
        if safe_callable(recurdef):
            recurdef = recurdef()

        if not recurdef:
            # No recurrence: a single date.
            dates = [pydt(date_attr)]
        else:
            # Recurrence: expand into a sequence, optionally bounded by the
            # object's 'until' attribute.
            until = getattr(obj, self.attr_until, None)
            if safe_callable(until):
                until = until()

            dates = recurrence_sequence_ical(
                date_attr, recrule=recurdef, until=until)

        # New values as a set of ints; previously indexed values (if any)
        # come from the reverse index and are converted to a set as well.
        newvalues = IISet(map(dt2int, dates))
        oldvalues = self._unindex.get(documentId, _marker)
        if oldvalues is not _marker:
            oldvalues = IISet(oldvalues)

        # NOTE(review): newvalues is always the IISet built above, so the
        # 'newvalues is not _marker' tests in this method look always true
        # and the 'newvalues is _marker' branch below looks unreachable --
        # confirm before relying on that branch to clean up self._unindex.
        if oldvalues is not _marker and newvalues is not _marker\
                and not difference(newvalues, oldvalues)\
                and not difference(oldvalues, newvalues):
            # difference is calculated relative to first argument, so we have
            # to use it twice here
            return returnStatus

        if oldvalues is not _marker:
            # Drop every previously indexed value for this document from the
            # forward index before inserting the new ones.
            for oldvalue in oldvalues:
                self.removeForwardIndexEntry(oldvalue, documentId)
            if newvalues is _marker:
                try:
                    del self._unindex[documentId]
                except ConflictError:
                    # Conflict errors are re-raised untouched.
                    raise
                except Exception:
                    LOG.error("Should not happen: oldvalues was there,"
                              " now it's not, for document with id %s" %
                              documentId)

        if newvalues is not _marker:
            inserted = False
            for value in newvalues:
                self.insertForwardIndexEntry(value, documentId)
                inserted = True
            if inserted:
                # store tuple values in reverse index entries for sorting
                self._unindex[documentId] = tuple(newvalues)
                returnStatus = 1

        # The counter is only incremented when something was inserted.
        if returnStatus > 0:
            self._increment_counter()

        return returnStatus
示例#45
0
 def difference(self, *args):
     """Call ``BTrees.IIBTree.difference`` with the given arguments."""
     # Deferred import: resolved each time the method is called.
     from BTrees import IIBTree as _iibtree
     return _iibtree.difference(*args)
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in 'request', which
        should be a mapping object.

        If the request does not contain the needed parameters, then
        return None.

        Otherwise return two objects.  The first object is the set of
        record numbers of the matching records.  The second object is a
        tuple containing the names of all data fields used.

        If 'resultset' is given, the records excluded by the term are
        computed and subtracted from 'resultset'; otherwise the matching
        records are computed directly.  When a REQUEST and a catalog can
        be acquired, the computed set is kept in a per-request cache and
        reused by later calls that produce the same cache key.
        """
        iid = self.id
        record = parseIndexRequest(request, iid, self.query_options)
        if record.keys is None:
            return None

        used_fields = (self._since_field, self._until_field)
        term = self._convertDateTime(record.keys[0])
        REQUEST = aq_get(self, 'REQUEST', None)
        if REQUEST is not None:
            catalog = aq_parent(aq_parent(aq_inner(self)))
            if catalog is not None:
                key = self._cache_key(catalog)
                cache = REQUEST.get(key, None)
                # Integer terms are keyed by term // 10.  An explicit
                # conditional is used because the and/or idiom maps terms
                # 0..9 (quotient 0, which is falsy) onto the 'None' key,
                # and '//' keeps the quotient integral under true division.
                if isinstance(term, int):
                    tid = term // 10
                else:
                    tid = 'None'
                if resultset is None:
                    cachekey = '_daterangeindex_%s_%s' % (iid, tid)
                else:
                    cachekey = '_daterangeindex_inverse_%s_%s' % (iid, tid)
                if cache is None:
                    cache = REQUEST[key] = RequestCache()
                else:
                    cached = cache.get(cachekey, None)
                    if cached is not None:
                        if resultset is None:
                            return (cached, used_fields)
                        # The inverse cache holds the excluded records.
                        return (difference(resultset, cached), used_fields)

        if resultset is None:
            # Aggregate sets for each bucket separately, to avoid
            # large-small union penalties.
            until_only = multiunion(self._until_only.values(term))
            since_only = multiunion(self._since_only.values(None, term))
            until = multiunion(self._until.values(term))
            # No intersection with 'resultset' here: it is None in this
            # branch, and intersecting with None returns the other operand.
            since = multiunion(self._since.values(None, term))
            bounded = intersection(until, since)

            # Merge from smallest to largest.
            result = multiunion([bounded, until_only, since_only,
                                 self._always])
            if REQUEST is not None and catalog is not None:
                cache[cachekey] = result

            return (result, used_fields)

        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([since, since_only, until_only, until])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return (difference(resultset, result), used_fields)
示例#47
0
 def and_not(self, x):
     """Return a new instance of this class built from the difference of
     ``self._dict`` and ``x._dict``.

     ``_words`` and ``_index`` are passed through from ``self``.
     """
     cls = self.__class__
     remaining = difference(self._dict, x._dict)
     return cls(remaining, self._words, self._index)
示例#48
0
def checkCatalog(path, indexes):
    """Perform some consistency checks on a ZCatalog instance.

    'path' is traversed from the Zope application root to find the
    catalog; if traversal raises AttributeError an error is printed and
    the process exits with status 1.  'indexes' is a collection of index
    meta types to check; when it is empty every index is considered.
    All findings are printed to standard output; nothing is returned.
    """

    root = Zope2.app()

    try:
        catalog = root.unrestrictedTraverse(path)
    except AttributeError:
        print('Error: catalog object not found')
        sys.exit(1)

    # get Catalog instance
    _cat = catalog._catalog

    # check Catalog internal BTrees
    l_data = sorted(_cat.data.keys())
    l_uids = sorted(_cat.uids.values())
    # Read the paths mapping itself: reading 'data' a second time made the
    # data-vs-paths comparison below trivially true.
    l_paths = sorted(_cat.paths.keys())

    print("Checking catalog internal BTrees")
    print("\tINFO: Mapping data:  %d entries" % len(l_data))
    print("\tINFO: Mapping uids:  %d entries" % len(l_uids))
    print("\tINFO: Mapping paths: %d entries" % len(l_paths))

    if l_data == l_uids:
        print("\tOK:  Mapping data equals Mapping uids")
    else:
        print("\tERR: Mapping data does not equal Mapping uids")

    if l_data == l_paths:
        print("\tOK:  Mapping data equals Maaping paths")
    else:
        print("\tERR: Mapping data does not equal Maaping paths")

    # Record ids known to the catalog; every index entry must be one of
    # these.  Built once because it does not change during the checks.
    catalog_rids = IISet(_cat.data.keys())

    # check BTrees of indexes

    for id, idx in _cat.indexes.items():

        if indexes and idx.meta_type not in indexes:
            continue

        print("Checking index '%s' (type: %s)" % (id, idx.meta_type))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex']:

            # check forward entries: a value is either a single int rid or
            # a collection whose keys are rids
            RIDS = IISet()
            for rids in idx._index.values():
                if isinstance(rids, IntType):
                    RIDS.insert(rids)
                else:
                    for rid in rids.keys():
                        RIDS.insert(rid)

            diff = difference(RIDS, catalog_rids)
            if len(diff) != 0:
                print('\tERR: Problem with forward entries')
                print('\tERR: too much forward entries: %s' % (diff,))
            else:
                print('\tOK:  Forward entries (%d entries)' % (len(RIDS)))

        elif idx.meta_type in ['PathIndex']:

            RIDS = IISet()

            # Only the first value of each entry is inspected.
            for rids in idx._index.values():
                for rid in rids.values()[0]:
                    RIDS.insert(rid)

            diff = difference(RIDS, catalog_rids)
            if len(diff) != 0:
                print('\tERR: Problem with forward entries')
                print('\tERR: too much forward entries: %s' % (diff,))
            else:
                print('\tOK:  Forward entries (%d entries)' % (len(RIDS)))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex', 'PathIndex']:

            # check backward entries
            RIDS = IISet(idx._unindex.keys())
            diff = difference(RIDS, catalog_rids)
            if len(diff) != 0:
                print('\tERR: Problem with backward entries')
                print('\tERR: too much backward entries: %s' % (diff,))
            else:
                print('\tOK:  Backward entries (%d entries)' % (len(RIDS)))
示例#49
0
    def _apply_index(self, request, resultset=None):
        """
            Apply the index to query parameters given in 'request', which
            should be a mapping object.

            If the request does not contain the needed parameters, then
            return None.

            Otherwise return two objects.  The first object is the set of
            record numbers of the matching records.  The second object is
            a tuple containing the names of all data fields used.

            If 'resultset' is given, the records excluded by the term are
            computed and subtracted from 'resultset'; otherwise the
            matching records are computed directly.  When a REQUEST and a
            catalog can be acquired, the computed set is kept in a
            per-request cache and reused by later calls that produce the
            same cache key.
        """
        iid = self.id
        record = parseIndexRequest(request, iid, self.query_options)
        if record.keys is None:
            return None

        used_fields = (self._since_field, self._until_field)
        term = self._convertDateTime(record.keys[0])
        REQUEST = aq_get(self, 'REQUEST', None)
        if REQUEST is not None:
            catalog = aq_parent(aq_parent(aq_inner(self)))
            if catalog is not None:
                key = self._cache_key(catalog)
                cache = REQUEST.get(key, None)
                # Integer terms are keyed by term // 10.  An explicit
                # conditional is used because the and/or idiom maps terms
                # 0..9 (quotient 0, which is falsy) onto the 'None' key,
                # and '//' keeps the quotient integral under true division.
                if isinstance(term, int):
                    tid = term // 10
                else:
                    tid = 'None'
                if resultset is None:
                    cachekey = '_daterangeindex_%s_%s' % (iid, tid)
                else:
                    cachekey = '_daterangeindex_inverse_%s_%s' % (iid, tid)
                if cache is None:
                    cache = REQUEST[key] = RequestCache()
                else:
                    cached = cache.get(cachekey, None)
                    if cached is not None:
                        if resultset is None:
                            return (cached, used_fields)
                        # The inverse cache holds the excluded records.
                        return (difference(resultset, cached), used_fields)

        if resultset is None:
            # Aggregate sets for each bucket separately, to avoid
            # large-small union penalties.
            until_only = multiunion(self._until_only.values(term))
            since_only = multiunion(self._since_only.values(None, term))
            until = multiunion(self._until.values(term))
            # No intersection with 'resultset' here: it is None in this
            # branch, and intersecting with None returns the other operand.
            since = multiunion(self._since.values(None, term))
            bounded = intersection(until, since)

            # Merge from smallest to largest.
            result = multiunion(
                [bounded, until_only, since_only, self._always])
            if REQUEST is not None and catalog is not None:
                cache[cachekey] = result

            return (result, used_fields)

        # Compute the inverse and subtract from res
        until_only = multiunion(self._until_only.values(None, term - 1))
        since_only = multiunion(self._since_only.values(term + 1))
        until = multiunion(self._until.values(None, term - 1))
        since = multiunion(self._since.values(term + 1))

        result = multiunion([until_only, since_only, until, since])
        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return (difference(resultset, result), used_fields)
示例#50
0
 def _eval(self, context):
     """Subtract the wrapped query's result from the context's object ids.

     ``self._query`` is evaluated against the same ``context``.
     """
     all_ids = context._getObjectIds()
     hits = self._query._eval(context)
     return difference(all_ids, hits)
def daterangeindex_apply_index(self, request, cid='', res=None):
    """Apply a date range index to the query parameters in 'request'.

    Returns None when the request holds no keys for this index.
    Otherwise returns a pair: the set of record ids, and a tuple with the
    names of the since/until fields.

    If 'res' is given, the records excluded by the term are computed and
    subtracted from 'res'; otherwise the matching records are computed
    directly.  When a REQUEST and a catalog are available, the computed
    set is kept in a per-request cache keyed by catalog id, catalog
    counter, index id and term.  'cid' is accepted but not used here.
    """
    record = parseIndexRequest(request, self.getId())
    if record.keys is None:
        return None

    used_fields = (self._since_field, self._until_field)
    term = self._convertDateTime(record.keys[0])

    REQUEST = getattr(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = '%s_%s' % (catalog.getId(), catalog.getCounter())
            cache = REQUEST.get(key, None)
            # Integer terms are keyed by term // 10.  An explicit
            # conditional is used because the and/or idiom maps terms 0..9
            # (quotient 0, which is falsy) onto the 'None' key, and '//'
            # keeps the quotient integral under true division.
            if isinstance(term, int):
                tid = term // 10
            else:
                tid = 'None'
            index_id = self.getId()
            if res is None:
                cachekey = '_daterangeindex_%s_%s' % (index_id, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (index_id, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if res is None:
                        return cached, used_fields
                    # The inverse cache holds the excluded records.
                    return difference(res, cached), used_fields

    if res is None:
        #
        #   Aggregate sets for each bucket separately, to avoid
        #   large-small union penalties.
        #   XXX Does this apply for multiunion?
        #
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))
        # No intersection with 'res' here: it is None in this branch, and
        # intersecting with None returns the other operand.
        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)
        result = multiunion([bounded, until_only, since_only, self._always])

        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return result, used_fields

    # Compute the inverse and subtract from res
    until_only = multiunion(self._until_only.values(None, term - 1))
    since_only = multiunion(self._since_only.values(term + 1))
    until = multiunion(self._until.values(None, term - 1))
    since = multiunion(self._since.values(term + 1))

    result = multiunion([until_only, since_only, until, since])
    if REQUEST is not None and catalog is not None:
        cache[cachekey] = result
    return difference(res, result), used_fields
示例#52
0
 def and_not(self, x):
     """Build a new instance of this class from ``self._dict`` minus
     ``x._dict``, keeping this object's ``_words`` and ``_index``.
     """
     factory = self.__class__
     kept = difference(self._dict, x._dict)
     return factory(kept, self._words, self._index)
示例#53
0
def checkCatalog(path, indexes):
    """Perform some consistency checks on a ZCatalog instance.

    'path' is traversed from the Zope application root to find the
    catalog; if traversal raises AttributeError an error is printed and
    the process exits with status 1.  'indexes' is a collection of index
    meta types to check; when it is empty every index is considered.
    All findings are printed to standard output; nothing is returned.
    """

    root = Zope2.app()

    try:
        catalog = root.unrestrictedTraverse(path)
    except AttributeError:
        print('Error: catalog object not found')
        sys.exit(1)

    # get Catalog instance
    _cat = catalog._catalog

    # check Catalog internal BTrees
    l_data = sorted(_cat.data.keys())
    l_uids = sorted(_cat.uids.values())
    # Read the paths mapping itself: reading 'data' a second time made the
    # data-vs-paths comparison below trivially true.
    l_paths = sorted(_cat.paths.keys())

    print("Checking catalog internal BTrees")
    print("\tINFO: Mapping data:  %d entries" % len(l_data))
    print("\tINFO: Mapping uids:  %d entries" % len(l_uids))
    print("\tINFO: Mapping paths: %d entries" % len(l_paths))

    if l_data == l_uids:
        print("\tOK:  Mapping data equals Mapping uids")
    else:
        print("\tERR: Mapping data does not equal Mapping uids")

    if l_data == l_paths:
        print("\tOK:  Mapping data equals Maaping paths")
    else:
        print("\tERR: Mapping data does not equal Maaping paths")

    # Record ids known to the catalog; every index entry must be one of
    # these.  Built once because it does not change during the checks.
    catalog_rids = IISet(_cat.data.keys())

    # check BTrees of indexes

    for id, idx in _cat.indexes.items():

        if indexes and idx.meta_type not in indexes:
            continue

        print("Checking index '%s' (type: %s)" % (id, idx.meta_type))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex']:

            # check forward entries: a value is either a single int rid or
            # a collection whose keys are rids
            RIDS = IISet()
            for rids in idx._index.values():
                if isinstance(rids, IntType):
                    RIDS.insert(rids)
                else:
                    for rid in rids.keys():
                        RIDS.insert(rid)

            diff = difference(RIDS, catalog_rids)
            if len(diff) != 0:
                print('\tERR: Problem with forward entries')
                print('\tERR: too much forward entries: %s' % (diff,))
            else:
                print('\tOK:  Forward entries (%d entries)' % (len(RIDS)))

        elif idx.meta_type in ['PathIndex']:

            RIDS = IISet()

            # Only the first value of each entry is inspected.
            for rids in idx._index.values():
                for rid in rids.values()[0]:
                    RIDS.insert(rid)

            diff = difference(RIDS, catalog_rids)
            if len(diff) != 0:
                print('\tERR: Problem with forward entries')
                print('\tERR: too much forward entries: %s' % (diff,))
            else:
                print('\tOK:  Forward entries (%d entries)' % (len(RIDS)))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex', 'PathIndex']:

            # check backward entries
            RIDS = IISet(idx._unindex.keys())
            diff = difference(RIDS, catalog_rids)
            if len(diff) != 0:
                print('\tERR: Problem with backward entries')
                print('\tERR: too much backward entries: %s' % (diff,))
            else:
                print('\tOK:  Backward entries (%d entries)' % (len(RIDS)))
示例#54
0
    def query_index(self, record, resultset=None):
        """Search the index with the given IndexQuery object.

        If not `None`, the resultset argument
        indicates that the search result is relevant only on this set,
        i.e. everything outside resultset is of no importance.
        The index can use this information for optimizations.

        Always returns a set of record ids (an empty IISet when nothing
        matches).  Keys found via 'not' in the record are excluded from
        the result.  When a request cache is available, 'or' queries
        cache the resulting set and other queries cache the list of
        per-key sets, because their intersection depends on 'resultset'.
        """
        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)

        operator = record.operator

        cachekey = None
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record)
            if cachekey is not None:
                cached = None
                if operator == 'or':
                    cached = cache.get(cachekey, None)
                else:
                    cached_setlist = cache.get(cachekey, None)
                    if cached_setlist is not None:
                        r = resultset
                        for s in cached_setlist:
                            # the result is bound by the resultset
                            r = intersection(r, s)
                            # If intersection, we can't possibly get a
                            # smaller result
                            if not r:
                                break
                        cached = r

                if cached is not None:
                    if isinstance(cached, int):
                        cached = IISet((cached, ))

                    if not_parm:
                        not_parm = list(map(self._convert, not_parm))
                        exclude = self._apply_not(not_parm, resultset)
                        cached = difference(cached, exclude)

                    return cached

        if not record.keys and not_parm:
            # convert into indexed format
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # convert query arguments into indexed format
            record.keys = list(map(self._convert, record.keys))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = 'range'
            opr_args = []
            if range_parm.find('min') > -1:
                opr_args.append('min')
            if range_parm.find('max') > -1:
                opr_args.append('max')

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == 'range':  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Pass the upper bound through unconditionally: a bound of None
            # means "unbounded", while testing the bound for truth would
            # silently drop a legitimate falsy bound such as 0.
            setlist = index.values(lo, hi)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                # Single int values are wrapped in a set before the union.
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s,))
                    tmp.append(s)
                r = multiunion(tmp)

                if cachekey is not None:
                    cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s,))
                    tmp.append(s)
                if len(tmp) > 2:
                    setlist = sorted(tmp, key=len)
                else:
                    setlist = tmp

                # 'r' is not invariant of resultset. Thus, we
                # have to remember 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    # the result is bound by the resultset
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        else:  # not a range search
            # Filter duplicates
            setlist = []
            for k in record.keys:
                if k is None:
                    # Prevent None from being looked up. None doesn't
                    # have a valid ordering definition compared to any
                    # other object. BTrees 4.0+ will throw a TypeError
                    # "object has default comparison".
                    continue
                try:
                    s = index.get(k, None)
                except TypeError:
                    # key is not valid for this Btree so the value is None
                    LOG.error(
                        '%(context)s: query_index tried '
                        'to look up key %(key)r from index %(index)r '
                        'but key was of the wrong type.', dict(
                            context=self.__class__.__name__,
                            key=k,
                            index=self.id,
                        )
                    )
                    s = None
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # If union, we can possibly get a bigger result
                        continue
                    # If intersection, we can't possibly get a smaller result
                    if cachekey is not None:
                        # If operator is 'and', we have to cache a list of
                        # IISet objects
                        cache[cachekey] = [IISet()]
                    return IISet()
                elif isinstance(s, int):
                    s = IISet((s,))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.

                if resultset is not None and len(resultset) < 200:
                    smalllist = []
                    for s in setlist:
                        smalllist.append(intersection(resultset, s))
                    r = multiunion(smalllist)

                    # 'r' is not invariant of resultset.  Thus, we
                    # have to remember the union of 'setlist'. But
                    # this is maybe a performance killer. So we do not cache.
                    # if cachekey is not None:
                    #    cache[cachekey] = multiunion(setlist)

                else:
                    r = multiunion(setlist)
                    if cachekey is not None:
                        cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet()
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r
def daterangeindex_apply_index(self, request, cid='', res=None):
    """Apply a date range index to the query parameters in 'request'.

    Returns None when the request holds no keys for this index.
    Otherwise returns a pair: the set of record ids, and a tuple with the
    names of the since/until fields.

    If 'res' is given, the records excluded by the term are computed and
    subtracted from 'res'; otherwise the matching records are computed
    directly.  When a REQUEST and a catalog are available, the computed
    set is kept in a per-request cache keyed by catalog id, catalog
    counter, index id and term.  'cid' is accepted but not used here.
    """
    record = parseIndexRequest(request, self.getId())
    if record.keys is None:
        return None

    used_fields = (self._since_field, self._until_field)
    term = self._convertDateTime(record.keys[0])

    REQUEST = getattr(self, 'REQUEST', None)
    if REQUEST is not None:
        catalog = aq_parent(aq_parent(aq_inner(self)))
        if catalog is not None:
            key = '%s_%s' % (catalog.getId(), catalog.getCounter())
            cache = REQUEST.get(key, None)
            # Integer terms are keyed by term // 10.  An explicit
            # conditional is used because the and/or idiom maps terms 0..9
            # (quotient 0, which is falsy) onto the 'None' key, and '//'
            # keeps the quotient integral under true division.
            if isinstance(term, int):
                tid = term // 10
            else:
                tid = 'None'
            index_id = self.getId()
            if res is None:
                cachekey = '_daterangeindex_%s_%s' % (index_id, tid)
            else:
                cachekey = '_daterangeindex_inverse_%s_%s' % (index_id, tid)
            if cache is None:
                cache = REQUEST[key] = RequestCache()
            else:
                cached = cache.get(cachekey, None)
                if cached is not None:
                    if res is None:
                        return cached, used_fields
                    # The inverse cache holds the excluded records.
                    return difference(res, cached), used_fields

    if res is None:
        #
        #   Aggregate sets for each bucket separately, to avoid
        #   large-small union penalties.
        #   XXX Does this apply for multiunion?
        #
        until_only = multiunion(self._until_only.values(term))
        since_only = multiunion(self._since_only.values(None, term))
        until = multiunion(self._until.values(term))
        # No intersection with 'res' here: it is None in this branch, and
        # intersecting with None returns the other operand.
        since = multiunion(self._since.values(None, term))
        bounded = intersection(until, since)
        result = multiunion([bounded, until_only, since_only, self._always])

        if REQUEST is not None and catalog is not None:
            cache[cachekey] = result

        return result, used_fields

    # Compute the inverse and subtract from res
    until_only = multiunion(self._until_only.values(None, term - 1))
    since_only = multiunion(self._since_only.values(term + 1))
    until = multiunion(self._until.values(None, term - 1))
    since = multiunion(self._since.values(term + 1))

    result = multiunion([until_only, since_only, until, since])
    if REQUEST is not None and catalog is not None:
        cache[cachekey] = result
    return difference(res, result), used_fields
示例#56
0
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:
        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        commiting subtransactions.  If 'None' subtransactions are
        disabled.

        Returns the number of word ids recorded for the document, or 0
        when the source attribute cannot be read. """

        # The document source is the attribute named after this index's
        # id; callables are called, non-unicode values are str()-ed.
        try:
            text = getattr(obj, self.id)
            if safe_callable(text):
                text = text()
            if not isinstance(text, UnicodeType):
                text = str(text)
        except (AttributeError, TypeError):
            return 0

        # Optional '<id>_encoding' attribute; falls back to latin1.
        try:
            enc = getattr(obj, self.id + '_encoding')
            if safe_callable(enc):
                enc = enc()
            enc = str(enc)
        except (AttributeError, TypeError):
            enc = 'latin1'

        lexicon = self.getLexicon()
        splitter = lexicon.Splitter

        # Count occurrences per word.  Words starting with a double quote
        # are handed to _subindex; a word equal to the previous one is
        # skipped.
        counts = OIBTree()
        previous = None
        for token in list(splitter(text, encoding=enc)):
            if token[0] == '\"':
                previous = self._subindex(token[1:-1], counts, previous,
                                          splitter)
                continue
            if token == previous:
                continue
            previous = token
            counts[token] = counts.get(token, 0) + 1

        # Re-key the counts by word id.
        scores_by_wid = IIBucket()
        wid_for = lexicon.getWordId
        for token, count in counts.items():
            scores_by_wid[wid_for(token)] = count
        del counts

        known_wids = IISet(self._unindex.get(documentId, []))

        # Unindex the word ids the document no longer contains.
        stale_wids = difference(known_wids, scores_by_wid)
        self.unindex_objectWids(documentId, stale_wids)

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change.
        add_entry = self.insertForwardIndexEntry
        for wid, count in scores_by_wid.items():
            add_entry(wid, documentId, count)

        # Only rewrite the unindexing info when the wid list changed.
        new_wids = scores_by_wid.keys()
        if new_wids != known_wids.keys():
            self._unindex[documentId] = new_wids

        return len(new_wids)
示例#57
0
from BTrees.IIBTree import IISet, union, intersection, difference


def make_choice(data, per):
    """Return a list of random picks from *data*.

    The list holds ``per`` percent of ``len(data)`` elements (rounded
    toward zero), each drawn independently with ``choice``, so the same
    element may appear more than once.
    """
    # range() needs an integer; the percentage arithmetic yields a float.
    sample_size = int(len(data) * float(per) / 100.0)
    return [choice(data) for _ in range(sample_size)]


# Time union / intersection / difference of two random IISets for several
# population sizes and fill percentages, printing one row per combination:
# size, the two percentages, then the three elapsed times in seconds.
# The loop variable is called 'size' so the builtin max() is not shadowed.
for size in (500, 2500, 5000, 10000, 25000, 50000, 100000):
    data = range(size)

    for p1, p2 in ((25, 25), (25, 50), (25, 75), (25, 100), (50, 50), (50, 75),
                   (50, 100), (75, 75), (75, 100), (100, 100)):

        d1 = IISet(make_choice(data, p1))
        d2 = IISet(make_choice(data, p2))

        ts = time.time()
        union(d1, d2)
        tu = time.time() - ts

        ts = time.time()
        intersection(d1, d2)
        ti = time.time() - ts

        ts = time.time()
        difference(d1, d2)
        td = time.time() - ts

        # Single-argument print(...) produces the same output whether print
        # is a statement or a function.
        print('%6d %3d:%3d  %6.6f  %6.6f %6.6f' % (size, p1, p2, tu, ti, td))