示例#1
0
 def test_symmetric(self):
     # Round-trip a sample of wids through encode() and decode().
     from zope.index.text.widcode import decode
     from zope.index.text.widcode import encode
     # 1117 is meant as the stride, not the stop: ``range(2**28, 1117)``
     # starts above its stop and is therefore empty, so no assertion in
     # the loop body ever ran.
     for wid in range(0, 2**28, 1117):
         wids = [wid]
         code = encode(wids)
         self.assertEqual(decode(code), wids)
示例#2
0
 def test_encode_15_to_21_bits(self):
     # Wids in [2**14, 2**21) are expected to encode as three characters:
     # the top 7-bit group with 128 added, then the two lower groups as-is.
     from zope.index.text.widcode import encode
     for wid in range(2**14, 2**21, 255):
         top = wid >> 14
         middle = (wid >> 7) & 0x7F
         bottom = wid & 0x7F
         actual = encode([wid])
         expected = chr(top + 128) + chr(middle) + chr(bottom)
         self.assertEqual(actual, expected)
示例#3
0
    def _reindex_doc(self, docid, text):
        """Replace what is indexed for ``docid`` with the words of ``text``.

        Only the per-word entries that actually change for this document
        are touched.  Returns the number of word ids produced from ``text``.
        """
        # Weights for the document as it is indexed right now ...
        previous_wids = self.get_words(docid)
        previous_weights, _previous_docw = self._get_frequencies(previous_wids)

        # ... and as it should be indexed from the new text.
        current_wids = self._lexicon.sourceToWordIds(text)
        current_weights, current_docw = self._get_frequencies(current_wids)

        before = IFTreeSet(previous_weights.keys())
        after = IFTreeSet(current_weights.keys())

        kept = intersection(before, after)
        dropped = difference(before, kept)
        added = difference(after, kept)
        del before, after

        # Words that disappeared from the document.
        for wid in dropped.keys():
            self._del_wordinfo(wid, docid)

        # Words that are new to the document.
        for wid in added.keys():
            self._add_wordinfo(wid, current_weights[wid], docid)

        # Words present before and after: rewrite only when the score moved.
        # With the Okapi indexer that happens only for words whose counts
        # changed; with the cosine indexer it may happen for every wid,
        # since W(d) probably changed and W(d) is divided into every score.
        for wid in kept.keys():
            score = current_weights[wid]
            if previous_weights[wid] == score:
                continue
            self._add_wordinfo(wid, score, docid)

        self._docweight[docid] = current_docw
        self._docwords[docid] = widcode.encode(current_wids)
        return len(current_wids)
示例#4
0
    def _reindex_doc(self, docid, text):
        """Replace what is indexed for ``docid`` with the words of ``text``.

        Only the per-word entries that actually change for this document
        are touched.  Returns the number of word ids produced from ``text``.
        """
        # Weights for the document as it is indexed right now ...
        previous_wids = self.get_words(docid)
        previous_weights, _previous_docw = self._get_frequencies(previous_wids)

        # ... and as it should be indexed from the new text.
        current_wids = self._lexicon.sourceToWordIds(text)
        current_weights, current_docw = self._get_frequencies(current_wids)

        # Set type and set operations all come from the index's BTree family.
        IF = self.family.IF
        before = IF.TreeSet(previous_weights.keys())
        after = IF.TreeSet(current_weights.keys())

        kept = IF.intersection(before, after)
        dropped = IF.difference(before, kept)
        added = IF.difference(after, kept)
        del before, after

        # Words that disappeared from the document.
        for wid in dropped.keys():
            self._del_wordinfo(wid, docid)

        # Words that are new to the document.
        for wid in added.keys():
            self._add_wordinfo(wid, current_weights[wid], docid)

        # Words present before and after: rewrite only when the score moved.
        # With the Okapi indexer that happens only for words whose counts
        # changed; with the cosine indexer it may happen for every wid,
        # since W(d) probably changed and W(d) is divided into every score.
        for wid in kept.keys():
            score = current_weights[wid]
            if previous_weights[wid] == score:
                continue
            self._add_wordinfo(wid, score, docid)

        self._docweight[docid] = current_docw
        self._docwords[docid] = widcode.encode(current_wids)
        return len(current_wids)
示例#5
0
 def test_symmetric(self):
     # Round-trip a sample of wids through encode() and decode().
     from zope.index.text.widcode import decode
     from zope.index.text.widcode import encode
     # 1117 is meant as the stride, not the stop: ``xrange(2**28, 1117)``
     # starts above its stop and is therefore empty, so no assertion in
     # the loop body ever ran.
     for wid in xrange(0, 2**28, 1117):
         wids = [wid]
         code = encode(wids)
         self.assertEqual(decode(code), wids)
示例#6
0
 def test_encode_15_to_21_bits(self):
     # Wids in [2**14, 2**21) are expected to encode as three characters:
     # the top 7-bit group with 128 added, then the two lower groups as-is.
     from zope.index.text.widcode import encode
     for wid in xrange(2**14, 2**21, 255):
         upper, low = divmod(wid, 128)
         high, middle = divmod(upper, 128)
         actual = encode([wid])
         expected = chr(high + 128) + chr(middle) + chr(low)
         self.assertEqual(actual, expected)
示例#7
0
 def index_doc(self, docid, text):
     """Index ``text`` under ``docid`` and return its number of word ids.

     A docid that is already in ``_docwords`` is handed to
     ``_reindex_doc`` and that call's result is returned instead.
     """
     already_indexed = docid in self._docwords
     if already_indexed:
         return self._reindex_doc(docid, text)

     word_ids = self._lexicon.sourceToWordIds(text)
     frequencies = self._get_frequencies(word_ids)
     weight_by_wid, doc_weight = frequencies

     self._mass_add_wordinfo(weight_by_wid, docid)
     self._docweight[docid] = doc_weight
     self._docwords[docid] = widcode.encode(word_ids)
     return len(word_ids)
示例#8
0
 def test_encode_22_to_28_bits(self):
     # Wids in [2**21, 2**28) are expected to encode as four characters:
     # the top 7-bit group with 128 added, then the three lower groups.
     from zope.index.text.widcode import encode
     stride = (256 * 512) - 1
     for wid in range(2**21, 2**28, stride):
         top = wid >> 21
         upper_mid = (wid >> 14) & 0x7F
         lower_mid = (wid >> 7) & 0x7F
         bottom = wid & 0x7F
         actual = encode([wid])
         expected = (chr(top + 128) + chr(upper_mid)
                     + chr(lower_mid) + chr(bottom))
         self.assertEqual(actual, expected)
示例#9
0
 def test_encode_22_to_28_bits(self):
     # Wids in [2**21, 2**28) are expected to encode as four characters:
     # the top 7-bit group with 128 added, then the three lower groups.
     from zope.index.text.widcode import encode
     stride = (256 * 512) - 1
     for wid in xrange(2**21, 2**28, stride):
         rest, low = divmod(wid, 128)
         rest, lower_mid = divmod(rest, 128)
         high, upper_mid = divmod(rest, 128)
         actual = encode([wid])
         expected = (chr(high + 128) + chr(upper_mid)
                     + chr(lower_mid) + chr(low))
         self.assertEqual(actual, expected)
示例#10
0
 def index_doc(self, docid, text):
     """Index ``text`` under ``docid`` and return its number of word ids.

     A docid that is already in ``_docwords`` is handed to
     ``_reindex_doc`` and that call's result is returned instead; the
     document count is left alone on that path.
     """
     already_indexed = docid in self._docwords
     if already_indexed:
         return self._reindex_doc(docid, text)

     word_ids = self._lexicon.sourceToWordIds(text)
     frequencies = self._get_frequencies(word_ids)
     weight_by_wid, doc_weight = frequencies

     self._mass_add_wordinfo(weight_by_wid, docid)
     self._docweight[docid] = doc_weight
     self._docwords[docid] = widcode.encode(word_ids)

     try:
         self.documentCount.change(1)
     except AttributeError:
         # documentCount has no change(): upgrade it to a Length object
         # sized from _docweight, which already holds the new document.
         self.documentCount = Length.Length(len(self._docweight))
     return len(word_ids)
示例#11
0
 def index_doc(self, docid, text):
     """Add a new document to the index, or reindex a known one.

     Returns the number of word ids found in ``text`` for a new docid;
     for a docid already in ``_docwords`` the work and the return value
     are delegated to ``_reindex_doc``.
     """
     if docid in self._docwords:
         return self._reindex_doc(docid, text)

     wid_list = self._lexicon.sourceToWordIds(text)
     per_wid_weight, total_weight = self._get_frequencies(wid_list)
     self._mass_add_wordinfo(per_wid_weight, docid)

     self._docweight[docid] = total_weight
     self._docwords[docid] = widcode.encode(wid_list)

     word_count = len(wid_list)
     try:
         self.documentCount.change(1)
     except AttributeError:
         # No change() on documentCount: replace it with a Length object
         # sized from _docweight, which already holds the new document.
         self.documentCount = Length.Length(len(self._docweight))
     return word_count
示例#12
0
 def search_phrase(self, phrase):
     """Return a docid -> weight BTree for documents containing ``phrase``.

     Candidates are the documents matching every word id of the phrase;
     a candidate is kept only if the encoded phrase occurs in its stored
     word sequence.
     """
     phrase_wids = self._lexicon.termToWordIds(phrase)
     known_wids = self._remove_oov_wids(phrase_wids)
     if len(phrase_wids) != len(known_wids):
         # Some word is out of vocabulary, so the phrase cannot match.
         return self.family.IF.BTree()

     per_word_scores = self._search_wids(phrase_wids)
     candidates = mass_weightedIntersection(per_word_scores, self.family)
     if not candidates:
         return candidates

     needle = widcode.encode(phrase_wids)
     matches = self.family.IF.BTree()
     for docid, weight in candidates.items():
         found_at = self._docwords[docid].find(needle)
         if found_at >= 0:
             matches[docid] = weight
     return matches
示例#13
0
 def search_phrase(self, phrase):
     """Return a docid -> weight IFBTree for documents containing ``phrase``.

     Candidates are the documents matching every word id of the phrase;
     a candidate is kept only if the encoded phrase occurs in its stored
     word sequence.
     """
     phrase_wids = self._lexicon.termToWordIds(phrase)
     known_wids = self._remove_oov_wids(phrase_wids)
     if len(phrase_wids) != len(known_wids):
         # Some word is out of vocabulary, so the phrase cannot match.
         return IFBTree()

     per_word_scores = self._search_wids(phrase_wids)
     candidates = mass_weightedIntersection(per_word_scores)
     if not candidates:
         return candidates

     needle = widcode.encode(phrase_wids)
     matches = IFBTree()
     for docid, weight in candidates.items():
         found_at = self._docwords[docid].find(needle)
         if found_at >= 0:
             matches[docid] = weight
     return matches
示例#14
0
 def encode_wid(cls, l):
     """Build a ``cls`` instance from the widcode encoding of ``l``."""
     encoded = widcode.encode(l)
     return cls(encoded)
示例#15
0
 def test_encode_8_to_14_bits(self):
     # Wids in [2**7, 2**14) are expected to encode as two characters:
     # the upper 7-bit group with 128 added, then the lower group as-is.
     from zope.index.text.widcode import encode
     for wid in xrange(2**7, 2**14):
         high, low = divmod(wid, 128)
         actual = encode([wid])
         expected = chr(high + 128) + chr(low)
         self.assertEqual(actual, expected)
示例#16
0
 def test_encode_1_to_7_bits(self):
     # Wids below 2**7 are expected to encode as one character: wid + 128.
     from zope.index.text.widcode import encode
     for wid in xrange(2**7):
         actual = encode([wid])
         expected = chr(128 + wid)
         self.assertEqual(actual, expected)
示例#17
0
 def test_encode_1_to_7_bits(self):
     # Wids below 2**7 are expected to encode as one character: wid + 128.
     from zope.index.text.widcode import encode
     for wid in range(2**7):
         actual = encode([wid])
         expected = chr(128 + wid)
         self.assertEqual(actual, expected)
示例#18
0
 def test_encode_8_to_14_bits(self):
     # Wids in [2**7, 2**14) are expected to encode as two characters:
     # the upper 7-bit group with 128 added, then the lower group as-is.
     from zope.index.text.widcode import encode
     for wid in range(2**7, 2**14):
         upper = wid >> 7
         lower = wid & 0x7F
         actual = encode([wid])
         expected = chr(upper + 128) + chr(lower)
         self.assertEqual(actual, expected)
示例#19
0
文件: pwid.py 项目: yonglehou/zerodb
 def encode_wid(cls, l):
     """Alternate constructor: wrap the widcode encoding of ``l`` in ``cls``."""
     payload = widcode.encode(l)
     return cls(payload)
示例#20
0
def test_wid():
    """Round-trip a short wid list through ``PersistentWid``."""
    test_wids = list(range(5))
    pw = pwid.PersistentWid.encode_wid(test_wids)
    assert isinstance(pw, persistent.Persistent)
    assert pw.decode_wid() == test_wids
    # NOTE(review): assumes find() follows the str.find convention (offset
    # on a hit, -1 on a miss) -- confirm against PersistentWid.  Under that
    # convention a bare truthiness check passes on a miss and fails on a
    # hit at offset 0, so compare against 0 explicitly.
    assert pw.find(widcode.encode([1])) >= 0