Пример #1
0
def make_old_index():
    """Build and return a ``TextIndex`` backed by a stop-word ``Lexicon``.

    The lexicon is constructed from the mapping returned by
    ``get_stopdict()`` and has a fresh ``MySplitter`` instance assigned as
    its ``SplitterFunc``.  The index is created with ``"read"`` as its
    first argument and the lexicon passed by keyword.
    """
    # Imports stay local to the function, in their original order.
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from zope.index.text.stopdict import get_stopdict

    lexicon = Lexicon(get_stopdict())
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)
Пример #2
0
class StopWordAndSingleCharRemover(StopWordRemover):
    """
    A simple :class:`zope.index.text.interfaces.IPipelineElement`
    to remove stop words and words of a single character.

    The lookup table starts as a copy of the mapping returned by
    ``get_stopdict()`` and gains one ``None``-valued entry for every
    one-character string ``chr(0)`` .. ``chr(254)``.
    """

    # The attribute name shadows the builtin ``dict`` inside this class body;
    # it is kept because it is this class's public attribute (presumably the
    # table the inherited ``process`` consults -- verify against the base).
    # Work on a copy so the mapping returned by get_stopdict() is untouched.
    dict = get_stopdict().copy()

    # A generator expression keeps the loop variable out of the class
    # namespace; a plain ``for`` statement in a class body would leave it
    # behind as a stray class attribute.
    dict.update((chr(code), None) for code in range(255))
    # NOTE(review): range(255) stops at chr(254), so chr(255) is not added --
    # confirm whether that is intended.
Пример #3
0
class StopWordRemover(object):
    """
    Pipeline element (:class:`zope.index.text.interfaces.IPipelineElement`)
    that filters stop words out of a word list.

    .. seealso:: :func:`.get_stopdict`
    """

    # Class-level copy of the mapping returned by get_stopdict(); membership
    # in this mapping is what marks a word as a stop word.
    dict = get_stopdict().copy()

    def process(self, lst):
        """Return a new list with every word of *lst* not found in ``self.dict``."""
        stop_words = self.dict
        return [word for word in lst if word not in stop_words]
Пример #4
0
    def __init__(self, predicates, stop_words=None):
        """Store the predicates and group the TF-IDF ones by field.

        :param predicates: iterable of iterables of predicate objects; each
            predicate must expose ``type`` and ``field`` attributes.
        :param stop_words: stored as ``self.stop_words``.  When ``None``, a
            ``defaultdict`` is used whose missing keys are filled with a new
            ``set`` built from ``get_stopdict()``.
            NOTE(review): presumably keyed by field -- confirm with callers.
        """
        self.predicates = predicates

        if stop_words is None:
            self.stop_words = defaultdict(lambda: set(get_stopdict()))
        else:
            self.stop_words = stop_words

        # Maps predicate.field -> set of predicates whose type is
        # "TfidfPredicate".
        by_field = defaultdict(set)
        self.tfidf_fields = by_field

        for compound in predicates:
            for simple in compound:
                if simple.type != "TfidfPredicate":
                    continue
                by_field[simple.field].add(simple)
Пример #5
0
 def __init__(self, datafs, writable=0, trans=0, pack=0):
     """Open the storage at *datafs* and load (or create) the index state.

     :param datafs: passed to ``FileStorage`` as the storage location.
     :param writable: when falsy the storage is opened with
         ``read_only=True``.
     :param trans: stored as ``self.trans_limit``.
     :param pack: stored as ``self.pack_limit``.

     The root object of the opened connection is expected to hold the keys
     ``"index"``, ``"docpaths"``, ``"doctimes"`` and ``"watchfolders"``; any
     that is missing is created empty and stored back into the root.  A
     path -> docid map is rebuilt from ``docpaths`` and three summary counts
     are printed.
     """
     self.trans_limit, self.pack_limit = trans, pack
     self.trans_count = self.pack_count = 0
     self.stopdict = get_stopdict()
     self.mh = mhlib.MH()

     storage = FileStorage(datafs, read_only=(not writable))
     self.filestorage = storage
     self.database = DB(storage)
     self.connection = self.database.open()
     root = self.root = self.connection.root()

     # Each persisted attribute is fetched from the root, or created with
     # its factory and written back when the key is absent.
     persisted = (
         ("index", TextIndexWrapper),
         ("docpaths", IOBTree),
         ("doctimes", IIBTree),
         ("watchfolders", dict),
     )
     for key, factory in persisted:
         try:
             value = root[key]
         except KeyError:
             value = factory()
             root[key] = value
         setattr(self, key, value)

     # Reverse lookup (path -> docid), rebuilt on every start rather than
     # read from the root.
     docpaths = self.docpaths
     reverse = OIBTree()
     self.path2docid = reverse
     for docid, path in docpaths.items():
         reverse[path] = docid

     # max() raises ValueError when there are no document ids yet.
     try:
         highest = max(docpaths.keys())
     except ValueError:
         highest = 0
     self.maxdocid = highest

     print(len(docpaths), "Document ids")
     print(len(reverse), "Pathnames")
     print(self.index.lexicon.length(), "Words")
Пример #6
0
 def __init__(self, stop_words):
     """Set ``self.stop_words`` to the default stop words plus *stop_words*.

     :param stop_words: iterable of additional words, merged with the keys
         of the mapping returned by ``get_stopdict()``.
     """
     self.stop_words = set(get_stopdict()).union(stop_words)
Пример #7
0
 def __init__(self, stop_words):
     """Build ``self.stop_words`` from the default stop words and *stop_words*.

     :param stop_words: iterable of extra words added on top of the keys of
         the mapping returned by ``get_stopdict()``.
     """
     merged = set(get_stopdict())
     merged.update(stop_words)
     self.stop_words = merged
Пример #8
0
class StopWordAndSingleCharRemover(StopWordRemover):
    """Stop-word table extended with single-character words.

    The table starts as a copy of the mapping returned by ``get_stopdict()``
    and gains one ``None``-valued entry for every one-character string
    ``chr(0)`` .. ``chr(254)``.
    """

    # The attribute name shadows the builtin ``dict`` inside this class body;
    # it is kept because it is this class's public attribute (presumably the
    # table the inherited ``process`` consults -- verify against the base).
    # Work on a copy so the mapping returned by get_stopdict() is untouched.
    dict = get_stopdict().copy()

    # A generator expression keeps the loop variable out of the class
    # namespace; a plain ``for`` statement in a class body would leave it
    # behind as a stray class attribute.
    dict.update((chr(code), None) for code in range(255))
    # NOTE(review): range(255) stops at chr(254), so chr(255) is not added --
    # confirm whether that is intended.
Пример #9
0
class StopWordRemover(object):
    """Filter that drops every word found in the class-level ``dict`` table."""

    # Class-level copy of the mapping returned by get_stopdict(); membership
    # in this mapping is what marks a word as a stop word.
    dict = get_stopdict().copy()

    def process(self, lst):
        """Return a new list with every word of *lst* not found in ``self.dict``."""
        stop_words = self.dict
        return [word for word in lst if word not in stop_words]