def test_symmetric(self):
    """Check that ``decode`` undoes ``encode`` for sampled one-wid lists.

    Every 1117th wid in ``[0, 2**28)`` is encoded and decoded again; the
    result must equal the original one-element list.
    """
    from zope.index.text.widcode import decode
    from zope.index.text.widcode import encode
    # The bounds must be (start, stop, step).  A two-argument
    # ``range(2**28, 1117)`` starts above its stop, yields nothing, and
    # would let this test pass without checking a single wid.
    for wid in range(0, 2**28, 1117):
        wids = [wid]
        code = encode(wids)
        self.assertEqual(decode(code), wids)
def test_encode_15_to_21_bits(self):
    """Check the three-character code of wids in ``[2**14, 2**21)``.

    Every 255th wid in the range is split into three 7-bit groups.  The
    expected code is those groups as characters, most significant
    first, with 128 added to the leading group.
    """
    from zope.index.text.widcode import encode
    for wid in range(2**14, 2**21, 255):
        # Same values divmod(wid, 128) applied twice would give.
        low = wid & 0x7F
        middle = (wid >> 7) & 0x7F
        high = wid >> 14
        actual = encode([wid])
        expected = chr(high + 128) + chr(middle) + chr(low)
        self.assertEqual(actual, expected)
def _reindex_doc(self, docid, text):
    """Re-index *docid* against *text*, updating only the wids that changed.

    The weights of the stored word ids and of the word ids produced from
    *text* are compared.  Wids found only in the stored set are removed,
    wids found only in the new set are added, and wids present in both
    are rewritten only when their weight differs.

    Returns the number of word ids produced from *text*.
    """
    # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
    stored_wids = self.get_words(docid)
    stored_weights, _stored_docw = self._get_frequencies(stored_wids)
    fresh_wids = self._lexicon.sourceToWordIds(text)
    fresh_weights, fresh_docw = self._get_frequencies(fresh_wids)

    stored_set = IFTreeSet(stored_weights.keys())
    fresh_set = IFTreeSet(fresh_weights.keys())
    shared = intersection(stored_set, fresh_set)
    removed = difference(stored_set, shared)
    added = difference(fresh_set, shared)
    del stored_set, fresh_set

    for wid in removed.keys():
        self._del_wordinfo(wid, docid)

    for wid in added.keys():
        self._add_wordinfo(wid, fresh_weights[wid], docid)

    for wid in shared.keys():
        # For the Okapi indexer, the "if" will trigger only for words
        # whose counts have changed.  For the cosine indexer, the "if"
        # may trigger for every wid, since W(d) probably changed and
        # W(d) is divided into every score.
        score = fresh_weights[wid]
        if stored_weights[wid] != score:
            self._add_wordinfo(wid, score, docid)

    self._docweight[docid] = fresh_docw
    self._docwords[docid] = widcode.encode(fresh_wids)
    return len(fresh_wids)
def _reindex_doc(self, docid, text):
    """Re-index *docid* against *text*, updating only the wids that changed.

    Set operations come from ``self.family.IF``.  Wids found only in the
    stored set are removed, wids found only in the new set are added,
    and wids present in both are rewritten only when their weight
    differs.

    Returns the number of word ids produced from *text*.
    """
    # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
    stored_wids = self.get_words(docid)
    stored_weights, _stored_docw = self._get_frequencies(stored_wids)
    fresh_wids = self._lexicon.sourceToWordIds(text)
    fresh_weights, fresh_docw = self._get_frequencies(fresh_wids)

    IF = self.family.IF
    stored_set = IF.TreeSet(stored_weights.keys())
    fresh_set = IF.TreeSet(fresh_weights.keys())
    shared = IF.intersection(stored_set, fresh_set)
    removed = IF.difference(stored_set, shared)
    added = IF.difference(fresh_set, shared)
    del stored_set, fresh_set

    for wid in removed.keys():
        self._del_wordinfo(wid, docid)

    for wid in added.keys():
        self._add_wordinfo(wid, fresh_weights[wid], docid)

    for wid in shared.keys():
        # For the Okapi indexer, the "if" will trigger only for words
        # whose counts have changed.  For the cosine indexer, the "if"
        # may trigger for every wid, since W(d) probably changed and
        # W(d) is divided into every score.
        score = fresh_weights[wid]
        if stored_weights[wid] != score:
            self._add_wordinfo(wid, score, docid)

    self._docweight[docid] = fresh_docw
    self._docwords[docid] = widcode.encode(fresh_wids)
    return len(fresh_wids)
def test_symmetric(self):
    """Check that ``decode`` undoes ``encode`` for sampled one-wid lists.

    Every 1117th wid in ``[0, 2**28)`` is encoded and decoded again; the
    result must equal the original one-element list.
    """
    from zope.index.text.widcode import decode
    from zope.index.text.widcode import encode
    # The bounds must be (start, stop, step).  A two-argument
    # ``xrange(2**28, 1117)`` starts above its stop, yields nothing, and
    # would let this test pass without checking a single wid.
    for wid in xrange(0, 2**28, 1117):
        wids = [wid]
        code = encode(wids)
        self.assertEqual(decode(code), wids)
def test_encode_15_to_21_bits(self):
    """Check the three-character code of wids in ``[2**14, 2**21)``.

    Every 255th wid in the range is broken into three base-128 digits.
    The expected code is those digits as characters, most significant
    first, with 128 added to the leading digit.
    """
    from zope.index.text.widcode import encode
    for wid in xrange(2**14, 2**21, 255):
        upper, low = divmod(wid, 128)
        high, middle = divmod(upper, 128)
        actual = encode([wid])
        expected = chr(high + 128) + chr(middle) + chr(low)
        self.assertEqual(actual, expected)
def index_doc(self, docid, text):
    """Index *text* under *docid* and return the number of word ids.

    A docid that already has an entry in ``self._docwords`` is handed to
    ``_reindex_doc``.  Otherwise the word ids from the lexicon are
    weighted, recorded, and stored in encoded form.
    """
    already_indexed = docid in self._docwords
    if already_indexed:
        return self._reindex_doc(docid, text)

    word_ids = self._lexicon.sourceToWordIds(text)
    weights, doc_weight = self._get_frequencies(word_ids)
    self._mass_add_wordinfo(weights, docid)
    self._docweight[docid] = doc_weight
    self._docwords[docid] = widcode.encode(word_ids)
    return len(word_ids)
def test_encode_22_to_28_bits(self):
    """Check the four-character code of wids in ``[2**21, 2**28)``.

    The range is sampled with a stride of ``256 * 512 - 1``.  Each wid
    is split into four 7-bit groups; the expected code is those groups
    as characters, most significant first, with 128 added to the
    leading group.
    """
    from zope.index.text.widcode import encode
    STEP = (256 * 512) - 1
    for wid in range(2**21, 2**28, STEP):
        # Same values divmod(wid, 128) applied three times would give.
        low = wid & 0x7F
        low_middle = (wid >> 7) & 0x7F
        high_middle = (wid >> 14) & 0x7F
        high = wid >> 21
        actual = encode([wid])
        expected = (chr(high + 128) + chr(high_middle) +
                    chr(low_middle) + chr(low))
        self.assertEqual(actual, expected)
def test_encode_22_to_28_bits(self):
    """Check the four-character code of wids in ``[2**21, 2**28)``.

    The range is sampled with a stride of ``256 * 512 - 1``.  Each wid
    is broken into four base-128 digits; the expected code is those
    digits as characters, most significant first, with 128 added to the
    leading digit.
    """
    from zope.index.text.widcode import encode
    STEP = (256 * 512) - 1
    for wid in xrange(2**21, 2**28, STEP):
        rest, low = divmod(wid, 128)
        rest, low_middle = divmod(rest, 128)
        high, high_middle = divmod(rest, 128)
        actual = encode([wid])
        expected = (chr(high + 128) + chr(high_middle) +
                    chr(low_middle) + chr(low))
        self.assertEqual(actual, expected)
def index_doc(self, docid, text):
    """Index *text* under *docid* and return the number of word ids.

    A docid that already has an entry in ``self._docwords`` is handed to
    ``_reindex_doc``.  Otherwise the word ids from the lexicon are
    weighted, recorded, stored in encoded form, and the document counter
    is bumped by one.
    """
    already_indexed = docid in self._docwords
    if already_indexed:
        return self._reindex_doc(docid, text)

    word_ids = self._lexicon.sourceToWordIds(text)
    weights, doc_weight = self._get_frequencies(word_ids)
    self._mass_add_wordinfo(weights, docid)
    self._docweight[docid] = doc_weight
    self._docwords[docid] = widcode.encode(word_ids)

    try:
        self.documentCount.change(1)
    except AttributeError:
        # upgrade documentCount to Length object
        upgraded = Length.Length(len(self._docweight))
        self.documentCount = upgraded

    return len(word_ids)
def search_phrase(self, phrase):
    """Return a docid -> weight mapping for documents matching *phrase*.

    The phrase is turned into word ids by the lexicon.  Candidate
    documents come from the weighted intersection of the per-wid
    results; a candidate is kept only when the encoded wid sequence
    occurs inside its stored, encoded word list.
    """
    wids = self._lexicon.termToWordIds(phrase)
    known_wids = self._remove_oov_wids(wids)
    if len(wids) != len(known_wids):
        # At least one wid was OOV: can't possibly find it.
        return self.family.IF.BTree()

    per_wid_scores = self._search_wids(wids)
    hits = mass_weightedIntersection(per_wid_scores, self.family)
    if not hits:
        return hits

    needle = widcode.encode(wids)
    matches = self.family.IF.BTree()
    for docid, weight in hits.items():
        stored = self._docwords[docid]
        if stored.find(needle) < 0:
            continue
        matches[docid] = weight
    return matches
def search_phrase(self, phrase):
    """Return a docid -> weight mapping for documents matching *phrase*.

    The phrase is turned into word ids by the lexicon.  Candidate
    documents come from the weighted intersection of the per-wid
    results; a candidate is kept only when the encoded wid sequence
    occurs inside its stored, encoded word list.
    """
    wids = self._lexicon.termToWordIds(phrase)
    known_wids = self._remove_oov_wids(wids)
    if len(wids) != len(known_wids):
        # At least one wid was OOV: can't possibly find it.
        return IFBTree()

    per_wid_scores = self._search_wids(wids)
    hits = mass_weightedIntersection(per_wid_scores)
    if not hits:
        return hits

    needle = widcode.encode(wids)
    matches = IFBTree()
    for docid, weight in hits.items():
        stored = self._docwords[docid]
        if stored.find(needle) < 0:
            continue
        matches[docid] = weight
    return matches
def encode_wid(cls, l):
    """Build a *cls* instance from the ``widcode`` encoding of *l*.

    *l* is passed unchanged to ``widcode.encode``; the encoded value is
    the sole argument given to *cls*.
    """
    encoded = widcode.encode(l)
    return cls(encoded)
def test_encode_8_to_14_bits(self):
    """Check the two-character code of every wid in ``[2**7, 2**14)``.

    The expected code is the wid's two base-128 digits as characters,
    high digit first, with 128 added to the high digit.
    """
    from zope.index.text.widcode import encode
    for wid in xrange(2**7, 2**14):
        high, low = divmod(wid, 128)
        actual = encode([wid])
        expected = chr(high + 128) + chr(low)
        self.assertEqual(actual, expected)
def test_encode_1_to_7_bits(self):
    """Check the one-character code of every wid below ``2**7``.

    The expected code is the single character ``chr(wid + 128)``.
    """
    from zope.index.text.widcode import encode
    for wid in xrange(2**7):
        actual = encode([wid])
        expected = chr(wid + 128)
        self.assertEqual(actual, expected)
def test_encode_1_to_7_bits(self):
    """Check the one-character code of every wid below ``2**7``.

    The expected code is the single character ``chr(wid + 128)``.
    """
    from zope.index.text.widcode import encode
    for wid in range(2**7):
        actual = encode([wid])
        expected = chr(wid + 128)
        self.assertEqual(actual, expected)
def test_encode_8_to_14_bits(self):
    """Check the two-character code of every wid in ``[2**7, 2**14)``.

    The expected code is the wid's two 7-bit groups as characters, high
    group first, with 128 added to the high group.
    """
    from zope.index.text.widcode import encode
    for wid in range(2**7, 2**14):
        # Same values divmod(wid, 128) would give.
        low = wid & 0x7F
        high = wid >> 7
        actual = encode([wid])
        expected = chr(high + 128) + chr(low)
        self.assertEqual(actual, expected)
def test_wid():
    """Round-trip a small wid list through ``PersistentWid`` and search it.

    Checks that ``encode_wid`` yields a ``persistent.Persistent``
    instance, that ``decode_wid`` returns the original list, and that
    the encoding of wid 1 is found inside the stored value.
    """
    test_wids = list(range(5))
    pw = pwid.PersistentWid.encode_wid(test_wids)
    assert isinstance(pw, persistent.Persistent)
    assert pw.decode_wid() == test_wids
    # NOTE(review): assumes ``find`` follows ``str.find`` (offset of the
    # match, -1 for a miss) -- confirm against PersistentWid.  Under that
    # contract a bare ``assert pw.find(...)`` passes on a miss (-1 is
    # truthy) and fails on a match at offset 0, so compare explicitly.
    assert pw.find(widcode.encode([1])) >= 0