Example #1
def make_old_index():
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from zope.index.text.stopdict import get_stopdict

    # MySplitter is assumed to be defined elsewhere in this module.
    lexicon = Lexicon(get_stopdict())
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)
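
Unlike the zope.index Lexicon used in the examples below, this legacy Products.PluginIndexes Lexicon is built directly from a stop-word dictionary, and its splitter is attached afterwards as a plain attribute.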
Example #2
def testThreeElementPipeline(self):
    lexicon = Lexicon(
        Splitter(),
        StopWordPipelineElement({"and": 1}),    # drops "and"
        StupidPipelineElement("dogs", "fish"),  # rewrites dogs -> fish
        WackyReversePipelineElement("fish"),    # rewrites fish -> hsif
    )
    # "cats and dogs" is stored as cats (wid 1) and hsif (wid 2).
    wids = lexicon.sourceToWordIds("cats and dogs")
    # The query "hsif" passes through the pipeline unchanged and
    # therefore matches the stored wid 2.
    wids = lexicon.termToWordIds("hsif")
    self.assertEqual(wids, [2])
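
StopWordPipelineElement, StupidPipelineElement, and WackyReversePipelineElement are helpers from zope.index's own test suite. Any object with a process(words) -> words method can serve as a pipeline element; a minimal sketch of a word-substituting element (hypothetical name, for illustration) looks like this:

class SubstitutePipelineElement:
    """Replace every occurrence of one word with another."""

    def __init__(self, fromword, toword):
        self.fromword = fromword
        self.toword = toword

    def process(self, words):
        # Words other than fromword pass through unchanged.
        return [self.toword if w == self.fromword else w for w in words]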
Example #3
def tfIdfBlock(self, data, field):
    '''Creates TF/IDF canopy of a given set of data'''

    # The class body executes inside this method, so it can read
    # self.stop_words and field from the enclosing scope.
    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))

    # Swap the default index for a cosine-similarity index over the
    # same lexicon.
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = doc
        index.index_doc(i, doc)

    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for canopy in canopies:
        key, index_canopy = canopy
        # Translate internal document ids back to record ids.
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.items())
        self.canopies[key] = defaultdict(str, id_canopy)
Example #4
def _makeOne(self, family=None):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    if family is None:
        family = self._getBTreesFamily()
    lexicon = Lexicon(Splitter())
    return self._getTargetClass()(lexicon, family=family)
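
The family argument selects the BTrees flavor (BTrees.family32 or BTrees.family64) used for the index's internal mappings, letting the suite exercise the target class under both integer widths.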
Example #5
File: catalog.py Project: Jickelsen/Arche
def _default_indexes():
    return {
        'title': CatalogFieldIndex(get_title),
        'description': CatalogFieldIndex(get_description),
        'type_name': CatalogFieldIndex(get_type_name),
        'sortable_title': CatalogFieldIndex(get_sortable_title),
        'path': CatalogPathIndex(get_path),
        'searchable_text': CatalogTextIndex(get_searchable_text,
                                            lexicon=Lexicon(Splitter(), CaseNormalizer())),
        'uid': CatalogFieldIndex(get_uid),
        'tags': CatalogKeywordIndex(get_tags),
        'search_visible': CatalogFieldIndex(get_search_visible),
        'date': CatalogFieldIndex(get_date),
        'modified': CatalogFieldIndex(get_modified),
        'created': CatalogFieldIndex(get_created),
        'wf_state': CatalogFieldIndex(get_wf_state),
        'workflow': CatalogFieldIndex(get_workflow),
    }.items()
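
Only searchable_text takes a lexicon: the field, keyword, and path indexes store their values verbatim, so the Splitter-plus-CaseNormalizer pipeline applies solely to the full-text index.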
Example #6
def _makeIndexAndParser(self):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    from zope.index.text.queryparser import QueryParser
    lexicon = Lexicon(Splitter())
    parser = QueryParser(lexicon)
    index = FauxIndex()  # test stub standing in for a real text index
    return index, parser
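
FauxIndex is a test stub; against a real index the parser is typically used by building a parse tree and evaluating it, roughly like this (a hedged sketch):

tree = parser.parseQuery("cats AND dogs")  # parse the query string into a tree
results = tree.executeQuery(index)         # evaluate it: docid -> score mapping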
Example #7
def __init__(self, lexicon=None, index=None):
    """Provisional constructor.

    This creates the lexicon and index if not passed in.
    """
    _explicit_lexicon = True
    if lexicon is None:
        _explicit_lexicon = False
        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    if index is None:
        index = OkapiIndex(lexicon)
    self.lexicon = _explicit_lexicon and lexicon or index.lexicon
    self.index = index
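
The expression _explicit_lexicon and lexicon or index.lexicon is the pre-Python-2.5 spelling of a conditional: it reads as lexicon if _explicit_lexicon else index.lexicon, which is safe here because a lexicon that was passed in is always truthy.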
Example #8
File: tfidf.py Project: dwyerk/dedupe
def __init__(self, field, stop_words=None):
    self.field = field

    splitter = Splitter()
    # Avoid the mutable-default-argument trap by normalizing here.
    stop_word_remover = CustomStopWordRemover(stop_words or [])
    operator_escaper = OperatorEscaper()
    lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

    self._index = TextIndex(lexicon)
    self._index.index = CosineIndex(self._index.lexicon)

    self._i_to_id = {}
    self._parseTerms = self._index.lexicon.parseTerms
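
CustomStopWordRemover and OperatorEscaper are dedupe-local pipeline elements, not part of zope.index. Judging from example #3, a plausible reconstruction of the stop-word remover (hypothetical, for illustration only):

class CustomStopWordRemover(object):
    def __init__(self, stop_words):
        self.stop_words = set(stop_words)

    def process(self, words):
        # Drop every word that appears in the stop list.
        return [w for w in words if w not in self.stop_words]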
Example #9
def __init__(self, discriminator, lexicon=None, index=None):
    _lexicon = lexicon
    if lexicon is None:
        _lexicon = Lexicon(
            Splitter(),
            CaseNormalizer(),
            StopWordRemover(),
        )
    if index is None:
        index = OkapiIndex(_lexicon, family=self.family)
    super(TextIndex, self).__init__(discriminator, lexicon, index)
    if lexicon is None:
        self.lexicon = index.lexicon
    self.index = index
    self.clear()
Example #10
def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # Pair each word with the number of documents it appears in.
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]

    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    threshold = int(max(1000, N * 0.05))

    stop_words = set()

    # doc_freq is sorted in descending order, so we can stop at the
    # first word that falls below the threshold.
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            break

    return stop_words
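
A word is declared a stop word when it occurs in more than 5% of documents, subject to a floor of 1,000 documents: with 100,000 documents indexed the cutoff is 5,000, while with 10,000 documents the 1,000-document floor applies.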
Example #11
def _create_full_text_index(self, language):
    lexicon = Lexicon(
        FullTextIndexProcessor(language, self.stemming)
    )
    return OkapiIndex(lexicon)
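
FullTextIndexProcessor is specific to this project, presumably combining splitting with language-aware stemming; whatever it does internally, it has to satisfy the same process() pipeline protocol as the stock Splitter.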
Example #12
def testTermToWordIds(self):
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("dogs")
    self.assertEqual(wids, [3])
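
Word ids are assigned sequentially from 1 in the order words are first seen, so after indexing "cats and dogs" the term dogs resolves to wid 3.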
Example #13
def testOnePipelineElement(self):
    lexicon = Lexicon(Splitter(), StupidPipelineElement("dogs", "fish"))
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("fish")
    self.assertEqual(wids, [3])
Example #14
def testMissingTermToWordIds(self):
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("boxes")
    self.assertEqual(wids, [0])
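
A term the lexicon has never seen maps to the reserved word id 0 rather than raising an error, which is why the lookup for "boxes" yields [0].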
Example #15
def setUp(self):
    self.lexicon = Lexicon(Splitter())
    self.index = self.IndexFactory(self.lexicon)
Example #16
class IndexTest(TestCase):

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def _test_index_document_assertions(self, DOCID=1):
        self.assertEqual(self.index.documentCount(), 1)
        self.assertEqual(self.index.wordCount(), 5)
        self.assertEqual(self.lexicon.wordCount(), 5)
        self.assertTrue(self.index.has_doc(DOCID))
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertTrue(DOCID in docmap)

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assertFalse(self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self._test_index_document_assertions(DOCID)

    def test_unindex_document_absent_docid(self):
        self.test_index_document(1)
        self.index.unindex_doc(2)
        self._test_index_document_assertions(1)

    def test_clear(self):
        self.test_index_document(1)
        self.index.clear()
        self._test_unindex_document_assertions()

    def _test_unindex_document_assertions(self):
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self._test_unindex_document_assertions()

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, docmap in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(docmap), 2)
                self.assertTrue(1 in docmap)
                self.assertTrue(DOCID in docmap)
            else:
                self.assertEqual(len(docmap), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertTrue(DOCID in docmap)

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        repetitive_wid = wids[0]
        for wid, docmap in self.index._wordinfo.items():
            self.assertEqual(len(docmap), 1)
            self.assertTrue(DOCID in docmap)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
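
IndexTest is an abstract harness: concrete subclasses supply IndexFactory (in zope.index, the Okapi and cosine index classes), so the same assertions run against each index implementation.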
Example #17
def testSplitterAdaptorNofold(self):
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("CATS and dogs")
    wids = lexicon.termToWordIds("cats and dogs")
    self.assertEqual(wids, [0, 2, 3])
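
Without a CaseNormalizer the lexicon stores "CATS" verbatim, so the lower-cased query misses it (wid 0) while "and" and "dogs" resolve to wids 2 and 3. Adding the normalizer folds case at both index and query time; a sketch:

lexicon = Lexicon(Splitter(), CaseNormalizer())
lexicon.sourceToWordIds("CATS and dogs")  # stored as cats, and, dogs
lexicon.termToWordIds("cats")             # now resolves to a real wid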
Example #18
def _makeLexicon(self, *pipeline):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    if not pipeline:
        pipeline = (Splitter(),)
    return Lexicon(*pipeline)
Example #19
def _makeLexicon(self):
    from zope.index.text.lexicon import Lexicon
    return Lexicon(*self._makePipeline())