def make_old_index():
    """Construct a legacy TextIndex on the 'read' field.

    The index uses a Lexicon seeded with the standard stop-word
    dictionary and has its splitter replaced by MySplitter (defined
    elsewhere in this module).
    """
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.ZCTextIndex.StopDict import get_stopdict

    lexicon = Lexicon(get_stopdict())
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)
class StopWordRemover:
    """Pipeline element that removes stop words from a word list.

    Prefers the C-accelerated ``process`` from
    Products.ZCTextIndex.stopper when it is importable; otherwise a
    pure-Python fallback is used.
    """

    # Private copy of the shared stop-word table so local mutation
    # cannot affect other users of get_stopdict().
    dict = get_stopdict().copy()

    try:
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        def process(self, lst):
            # Pure-Python fallback.  Uses ``in`` membership instead of
            # dict.has_key(), which was removed in Python 3; this also
            # matches the other StopWordRemover variant in this file.
            d = self.dict
            return [w for w in lst if w not in d]
    else:
        def process(self, lst):
            # Delegate to the C implementation bound above.
            return self._process(self.dict, lst)
def testDocUpdate(self): docid = 1 # doesn't change -- we index the same doc repeatedly N = len(text) stop = get_stopdict() d = {} # word -> list of version numbers containing that word for version, i in zip(text, range(N)): # use a simple splitter rather than an official one words = [w for w in re.split(r'\W+', version.lower()) if len(w) > 1 and w not in stop] word_seen = {} for w in words: if w not in word_seen: d.setdefault(w, []).append(i) word_seen[w] = 1 unique = {} # version number -> list of words unique to that version common = [] # list of words common to all versions for w, versionlist in d.items(): if len(versionlist) == 1: unique.setdefault(versionlist[0], []).append(w) elif len(versionlist) == N: common.append(w) self.assertGreater(len(common), 0) self.assertGreater(len(unique), 0) for version, i in zip(text, range(N)): doc = Indexable(version) self.zc_index.index_object(docid, doc) for w in common: nbest, total = self.zc_index.query(w) self.assertEqual(total, 1, 'did not find {0}'.format(w)) for k, v in unique.items(): if k == i: continue for w in v: nbest, total = self.zc_index.query(w) self.assertEqual( total, 0, 'did not expect to find {0}'.format(w) )
def __init__(self, datafs, writable=0, trans=0, pack=0):
    """Open (or create) the mail-index database stored in *datafs*.

    datafs   -- path to the ZODB FileStorage data file
    writable -- open the storage read-write when true
    trans    -- commit a transaction every *trans* documents (0 = never)
    pack     -- pack the storage every *pack* documents (0 = never)
    """
    self.trans_limit = trans
    self.pack_limit = pack
    self.trans_count = 0  # documents indexed since last commit
    self.pack_count = 0   # documents indexed since last pack
    self.stopdict = get_stopdict()
    self.mh = mhlib.MH()  # MH mailbox handle (Python 2 stdlib)
    self.filestorage = FileStorage(datafs, read_only=(not writable))
    self.database = DB(self.filestorage)
    self.connection = self.database.open()
    self.root = self.connection.root()
    # Each persistent structure is fetched from the root, and created
    # on first use (KeyError) so a fresh database bootstraps itself.
    try:
        self.index = self.root["index"]
    except KeyError:
        self.index = self.root["index"] = TextIndex()
    try:
        self.docpaths = self.root["docpaths"]
    except KeyError:
        self.docpaths = self.root["docpaths"] = IOBTree()
    try:
        self.doctimes = self.root["doctimes"]
    except KeyError:
        self.doctimes = self.root["doctimes"] = IIBTree()
    try:
        self.watchfolders = self.root["watchfolders"]
    except KeyError:
        self.watchfolders = self.root["watchfolders"] = {}
    # Non-persistent reverse map, rebuilt on every open: path -> docid.
    self.path2docid = OIBTree()
    for docid in self.docpaths.keys():
        path = self.docpaths[docid]
        self.path2docid[path] = docid
    # max() raises ValueError on an empty key set (new database).
    try:
        self.maxdocid = max(self.docpaths.keys())
    except ValueError:
        self.maxdocid = 0
    print len(self.docpaths), "Document ids"
    print len(self.path2docid), "Pathnames"
    print self.index.lexicon.length(), "Words"
class StopWordAndSingleCharRemover(StopWordRemover):
    """Stop-word remover that additionally discards every
    single-character token chr(0) through chr(254)."""

    # Private copy of the stop-word table, extended with all
    # one-character strings below ordinal 255.
    dict = get_stopdict().copy()
    dict.update({chr(code): None for code in range(255)})
class StopWordRemover(object):
    """Filter that drops stop words from a token list."""

    # Class-wide stop-word table; copied so edits here cannot leak back
    # into the shared dictionary returned by get_stopdict().
    dict = get_stopdict().copy()

    def process(self, lst):
        """Return *lst* with all stop words removed, order preserved."""
        stopwords = self.dict
        return [word for word in lst if word not in stopwords]