def make_old_index():
    """
    Build a legacy ``TextIndex`` over the ``read`` attribute, wired to a
    ``Lexicon`` seeded with the standard stop words and using ``MySplitter``
    as its splitter function.
    """
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from zope.index.text.stopdict import get_stopdict

    lexicon = Lexicon(get_stopdict())
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)
class StopWordAndSingleCharRemover(StopWordRemover):
    """
    A simple :class:`zope.index.text.interfaces.IPipelineElement` to remove
    stop words and words of a single character.
    """

    # Start from the shared stop-word table, then blacklist every
    # one-character word in the 8-bit range 0..254.
    # NOTE(review): range(255) stops at chr(254), so chr(255) is never
    # removed -- confirm whether that off-by-one is intentional upstream.
    dict = get_stopdict().copy()
    dict.update((chr(code), None) for code in range(255))
class StopWordRemover(object):
    """
    A simple :class:`zope.index.text.interfaces.IPipelineElement` to remove
    stop words.

    .. seealso:: :func:`.get_stopdict`
    """

    dict = get_stopdict().copy()

    def process(self, lst):
        """Return *lst* with every stop word filtered out."""
        stop = self.dict
        return [word for word in lst if word not in stop]
def __init__(self, predicates, stop_words = None) :
    """
    Store the predicate set and index which fields use TF-IDF predicates.

    :param predicates: iterable of compound predicates; each compound
        predicate is itself iterable over simple predicates carrying
        ``type`` and ``field`` attributes.
    :param stop_words: optional mapping of field -> stop-word set; when
        omitted, every field lazily defaults to the shared stop-word
        dictionary from :func:`get_stopdict`.
    """
    if stop_words is None :
        stop_words = defaultdict(lambda : set(get_stopdict()))

    self.predicates = predicates
    self.stop_words = stop_words

    # Group the TF-IDF predicates by the field they apply to.
    self.tfidf_fields = defaultdict(set)
    for compound in predicates :
        for simple in compound :
            if simple.type == "TfidfPredicate" :
                self.tfidf_fields[simple.field].add(simple)
def __init__(self, datafs, writable=0, trans=0, pack=0):
    """
    Open the ZODB ``FileStorage`` at *datafs* and load or create the
    persistent index state.

    :param datafs: path to the ZODB FileStorage data file.
    :param writable: when true, open the storage read-write.
    :param trans: presumably "commit every *trans* documents"; 0 disables
        the limit -- TODO confirm against the caller that uses trans_limit.
    :param pack: presumably "pack every *pack* documents"; 0 disables --
        TODO confirm against the caller that uses pack_limit.
    """
    # Counters/limits for the periodic transaction/pack policy.
    self.trans_limit = trans
    self.pack_limit = pack
    self.trans_count = 0
    self.pack_count = 0
    self.stopdict = get_stopdict()
    # NOTE(review): mhlib is the Python 2-only MH mailbox module, yet the
    # prints below use Python 3 call syntax -- confirm target Python version.
    self.mh = mhlib.MH()
    self.filestorage = FileStorage(datafs, read_only=(not writable))
    self.database = DB(self.filestorage)
    self.connection = self.database.open()
    self.root = self.connection.root()
    # Fetch-or-create each persistent structure in the database root
    # (EAFP "setdefault" over the ZODB root mapping).
    try:
        self.index = self.root["index"]
    except KeyError:
        self.index = self.root["index"] = TextIndexWrapper()
    try:
        self.docpaths = self.root["docpaths"]
    except KeyError:
        self.docpaths = self.root["docpaths"] = IOBTree()
    try:
        self.doctimes = self.root["doctimes"]
    except KeyError:
        self.doctimes = self.root["doctimes"] = IIBTree()
    try:
        self.watchfolders = self.root["watchfolders"]
    except KeyError:
        self.watchfolders = self.root["watchfolders"] = {}
    # Rebuild the in-memory reverse map (path -> docid) from docpaths.
    self.path2docid = OIBTree()
    for docid in self.docpaths.keys():
        path = self.docpaths[docid]
        self.path2docid[path] = docid
    # Highest docid already allocated; max() raises ValueError when the
    # index is empty, in which case we start from 0.
    try:
        self.maxdocid = max(self.docpaths.keys())
    except ValueError:
        self.maxdocid = 0
    print(len(self.docpaths), "Document ids")
    print(len(self.path2docid), "Pathnames")
    print(self.index.lexicon.length(), "Words")
def __init__(self, stop_words) :
    """
    Build the effective stop-word set: the standard stop words from
    :func:`get_stopdict` extended with the caller-supplied *stop_words*.
    """
    combined = set(get_stopdict().keys())
    combined.update(stop_words)
    self.stop_words = combined
def __init__(self, stop_words):
    """Combine the default stop-word dictionary with the extra *stop_words*."""
    self.stop_words = set(get_stopdict().keys()) | set(stop_words)
class StopWordAndSingleCharRemover(StopWordRemover):
    """Pipeline element dropping stop words plus all one-character words."""

    # Copy the shared table, then mark every 8-bit single character as a
    # stop word (value None, matching the stop-dict convention).
    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None
class StopWordRemover(object):
    """Pipeline element that filters common stop words out of a word list."""

    dict = get_stopdict().copy()

    def process(self, lst):
        """Return the words of *lst* that are not in the stop-word table."""
        table = self.dict
        return [w for w in lst if w not in table]