Пример #1
0
 def test_ZCTextIndex(self):
     # Assigning a parsed <index> node to the adapter of an already
     # configured ZCTextIndex must not call the index's clear(); clear is
     # replaced below by a function that raises.
     from xml.dom.minidom import parseString
     from Products.ZCTextIndex.ZCTextIndex import PLexicon
     from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
     from Products.GenericSetup.testing import DummySetupEnviron
     from Products.GenericSetup.ZCTextIndex.exportimport \
             import ZCTextIndexNodeAdapter

     def _refuse_clear(*args):
         raise AssertionError("Don't clear me!")

     index_xml = """\
     <index name="foo_zctext" meta_type="ZCTextIndex">
     <indexed_attr value="bar"/>
     <extra name="index_type" value="Okapi BM25 Rank"/>
     <extra name="lexicon_id" value="foo_plexicon"/>
     </index>
     """
     setup_env = DummySetupEnviron()

     holder = DummyCatalog()
     holder.foo_plexicon = PLexicon('foo_plexicon')

     options = _extra()
     options.lexicon_id = 'foo_plexicon'
     options.index_type = 'Okapi BM25 Rank'

     text_index = ZCTextIndex(
         'foo_field', extra=options, field_name='bar', caller=holder)
     text_index = text_index.__of__(holder)
     text_index.clear = _refuse_clear

     node_adapter = ZCTextIndexNodeAdapter(text_index, setup_env)
     node_adapter.node = parseString(index_xml).documentElement  # no raise
Пример #2
0
 def setUp(self):
     """Build a fresh lexicon and a ZCTextIndex that uses it.

     Stores the lexicon, the ZCTextIndex and the ZCTextIndex's underlying
     ``index`` attribute on the test instance.
     """
     pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
     self.lexicon = PLexicon('lexicon', '', *pipeline)
     holder = LexiconHolder(self.lexicon)
     self.zc_index = ZCTextIndex(
         'name', None, holder, self.IndexFactory, 'text', 'lexicon')
     self.index = self.zc_index.index
Пример #3
0
    def setUp(self):
        """Populate a catalog with two groups of indexes and sample objects.

        Each group is registered as indexes first and as columns second;
        afterwards ``self.upper`` dummy objects are catalogued and the
        catalog is wrapped in a dummy parent.
        """
        catalog = self._catalog = self._makeOne()
        catalog.lexicon = PLexicon('lexicon')

        def okapi_text_index(name):
            # Text index bound to the catalog's 'lexicon' attribute.
            return ZCTextIndex(name, caller=catalog,
                               index_factory=OkapiIndex,
                               lexicon_id='lexicon')

        col_group = [
            ('col1', FieldIndex('col1')),
            ('col2', okapi_text_index('col2')),
            ('col3', KeywordIndex('col3')),
        ]
        for name, idx in col_group:
            catalog.addIndex(name, idx)
        for name, _idx in col_group:
            catalog.addColumn(name)

        att_group = [
            ('att1', FieldIndex('att1')),
            ('att2', okapi_text_index('att2')),
            ('att3', KeywordIndex('att3')),
            ('num', FieldIndex('num')),
        ]
        for name, idx in att_group:
            catalog.addIndex(name, idx)
        for name, _idx in att_group:
            catalog.addColumn(name)

        for position in range(self.upper):
            catalog.catalogObject(dummy(self.nums[position]), repr(position))
        self._catalog = catalog.__of__(dummy('foo'))
Пример #4
0
    def test_ZCTextIndex(self):
        # Importing the XML node below into an existing ZCTextIndex must
        # leave the index alone: its clear() is swapped for a function that
        # raises, so any call to it fails the test.
        from xml.dom.minidom import parseString
        from Products.ZCTextIndex.ZCTextIndex import PLexicon
        from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
        from Products.GenericSetup.testing import DummySetupEnviron
        from Products.GenericSetup.ZCTextIndex.exportimport \
                import ZCTextIndexNodeAdapter

        def _forbid_clear(*ignored):
            raise AssertionError("Don't clear me!")

        xml_text = """\
        <index name="foo_zctext" meta_type="ZCTextIndex">
        <indexed_attr value="bar"/>
        <extra name="index_type" value="Okapi BM25 Rank"/>
        <extra name="lexicon_id" value="foo_plexicon"/>
        </index>
        """
        env = DummySetupEnviron()

        parent = DummyCatalog()
        parent.foo_plexicon = PLexicon('foo_plexicon')

        config = _extra()
        config.lexicon_id = 'foo_plexicon'
        config.index_type = 'Okapi BM25 Rank'

        index_kwargs = dict(extra=config, field_name='bar', caller=parent)
        wrapped_index = ZCTextIndex('foo_field', **index_kwargs).__of__(parent)
        wrapped_index.clear = _forbid_clear

        adapter = ZCTextIndexNodeAdapter(wrapped_index, env)
        adapter.node = parseString(xml_text).documentElement  # no raise
Пример #5
0
 def testAddTextIndex(self):
     # A ZCTextIndex added under 'id' must come back from the catalog's
     # ``indexes`` mapping as a ZCTextIndex.
     self._catalog.lexicon = PLexicon('lexicon')
     idx = ZCTextIndex('id', caller=self._catalog,
                       index_factory=OkapiIndex, lexicon_id='lexicon')
     self._catalog.addIndex('id', idx)
     i = self._catalog.indexes['id']
     # assertTrue replaces the deprecated ``assert_`` alias, which newer
     # unittest versions no longer provide.
     self.assertTrue(isinstance(i, ZCTextIndex), 'add text index failed')
Пример #6
0
    def _make_one(self, extra=None):
        """Return a wrapped Catalog pre-filled with indexes and objects.

        extra -- optional callable invoked with the bare catalog after the
            standard indexes are registered and before objects are
            catalogued.
        """
        from Products.ZCatalog.Catalog import Catalog
        cat = Catalog()
        cat.lexicon = PLexicon('lexicon')

        field_att1 = FieldIndex('att1')
        text_att2 = ZCTextIndex('att2', caller=cat,
                                index_factory=OkapiIndex,
                                lexicon_id='lexicon')
        cat.addIndex('att2', text_att2)
        field_num = FieldIndex('num')

        for name, idx in (('att1', field_att1), ('num', field_num)):
            cat.addIndex(name, idx)
        cat.addColumn('num')

        cat.addIndex('foo', MultiFieldIndex('foo'))

        if extra is not None:
            extra(cat)

        for position in range(self.upper):
            cat.catalogObject(Dummy(self.nums[position]), repr(position))
        return cat.__of__(Dummy('foo'))
Пример #7
0
 def test_add_text_index(self):
     # An index added under 'id' is retrievable from ``indexes`` and is a
     # ZCTextIndex.
     cat = self._make_one()
     cat.lexicon = PLexicon('lexicon')
     text_index = ZCTextIndex(
         'id',
         caller=cat,
         index_factory=OkapiIndex,
         lexicon_id='lexicon',
     )
     cat.addIndex('id', text_index)
     self.assertIsInstance(cat.indexes['id'], ZCTextIndex)
Пример #8
0
class QueryTestsBase(object):
    """Shared fixture that indexes a small fixed corpus in a ZCTextIndex."""

    # Subclasses of QueryTestsBase must set a class variable IndexFactory
    # to the kind of index to be constructed.
    IndexFactory = None

    # The FauxIndex in testQueryEngine contains four documents.
    # docid 1: foo, bar, ham
    # docid 2: bar, ham
    # docid 3: foo, ham
    # docid 4: ham

    docs = ['foo bar ham', 'bar ham', 'foo ham', 'ham']

    def setUp(self):
        """Create lexicon, index and query parser, then index ``docs``."""
        pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        self.lexicon = PLexicon('lexicon', '', *pipeline)
        holder = LexiconHolder(self.lexicon)

        self.zc_index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text', 'lexicon')
        self.parser = QueryParser(self.lexicon)
        self.index = self.zc_index.index
        self.add_docs()

    def add_docs(self):
        """Index each entry of ``docs`` under docids 1, 2, 3, ..."""
        for docid, text in enumerate(self.docs, 1):
            self.zc_index.index_object(docid, Indexable(text))

    def compareSet(self, set, dict):
        """Assert that both mappings have the same keys."""
        # The FauxIndex and the real Index score documents very
        # differently.  The set comparison can't actually compare the
        # items, but it can compare the keys.  That will have to do for now.
        self.assertEqual(sorted(set.keys()), sorted(dict.keys()))
Пример #9
0
 def testDelTextIndex(self):
     # After delIndex('id') the name must no longer appear in the
     # catalog's ``indexes`` mapping.
     self._catalog.lexicon = PLexicon('lexicon')
     idx = ZCTextIndex('id', caller=self._catalog,
                       index_factory=OkapiIndex, lexicon_id='lexicon')
     self._catalog.addIndex('id', idx)
     self._catalog.delIndex('id')
     # assertTrue replaces the deprecated ``assert_`` alias, which newer
     # unittest versions no longer provide.
     self.assertTrue('id' not in self._catalog.indexes,
                     'del index failed')
Пример #10
0
 def test_del_text_index(self):
     # Add a text index under 'id', delete it again, and check the name is
     # gone from ``indexes``.
     cat = self._make_one()
     cat.lexicon = PLexicon('lexicon')
     index_options = dict(caller=cat,
                          index_factory=OkapiIndex,
                          lexicon_id='lexicon')
     text_index = ZCTextIndex('id', **index_options)
     cat.addIndex('id', text_index)
     cat.delIndex('id')
     self.assertNotIn('id', cat.indexes)
Пример #11
0
 def testLexiconIsNotFoundRaisesLookupError(self):
     # Constructing the index with extra=None and no lexicon_id is expected
     # to raise LookupError.
     holder = LexiconHolder(self.lexicon)
     self.assertRaises(
         LookupError, ZCTextIndex, 'name', extra=None, caller=holder)
Пример #12
0
def index(rt, mboxfile, db, profiler):
    global NUM
    idx_time = 0
    pack_time = 0
    start_time = time.time()

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    extra = Extra()
    extra.lexicon_id = 'lexicon'
    extra.doc_attr = 'text'
    extra.index_type = 'Okapi BM25 Rank'
    caller = Extra()
    caller.lexicon = lexicon
    rt["index"] = idx = ZCTextIndex("index", extra, caller)
    if not EXCLUDE_TEXT:
        rt["documents"] = docs = IOBTree()
    else:
        docs = None
    transaction.commit()

    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
    if VERBOSE:
        print "opened", mboxfile
    if not NUM:
        NUM = sys.maxint

    if profiler:
        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
    else:
        itime, ptime, i = indexmbox(mbox, idx, docs, db)
    idx_time += itime
    pack_time += ptime

    transaction.commit()

    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
        if VERBOSE >= 2:
            print "packing one last time..."
        p0 = time.clock()
        db.pack(time.time())
        p1 = time.clock()
        if VERBOSE:
            print "pack took %s sec" % (p1 - p0)
        pack_time += p1 - p0

    if VERBOSE:
        finish_time = time.time()
        print
        print "Index time", round(idx_time / 60, 3), "minutes"
        print "Pack time", round(pack_time / 60, 3), "minutes"
        print "Index bytes", Message.total_bytes
        rate = (Message.total_bytes / idx_time) / 1024
        print "Index rate %.2f KB/sec" % rate
        print "Indexing began", time.ctime(start_time)
        print "Indexing ended", time.ctime(finish_time)
        print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
Пример #13
0
def make_zc_index():
    """Return a ZCTextIndex named "read" backed by a new Lexicon."""
    # there's an elaborate dance necessary to construct an index: the
    # settings and the lexicon are handed over as attributes of two
    # otherwise empty objects.
    class Struct:
        pass

    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    extra = Struct()
    extra.lexicon_id = "lexicon"
    extra.doc_attr = "read"

    return ZCTextIndex("read", extra, caller)
Пример #14
0
 def getLexicon(self):
     """Get the lexicon for this index

     The ZCTextIndex implementation is tried first.  If it raises, the
     lexicon is looked up by ``self.lexicon_id`` on the site's
     ``portal_catalog`` tool, cached in ``self._v_lexicon`` and returned.

     Raises TypeError if the object found on the catalog does not provide
     ILexicon.
     """
     try:
         return ZCTextIndex.getLexicon(self)
     except Exception:
         # Deliberately broad so that any ordinary failure of the base
         # lookup triggers the fallback, but no longer a bare ``except:``,
         # so KeyboardInterrupt and SystemExit propagate.
         catalog = getToolByName(getSite(), 'portal_catalog')
         lexicon = getattr(catalog, self.lexicon_id)
         if not ILexicon.providedBy(lexicon):
             raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                             % repr(lexicon))
         self._v_lexicon = lexicon
         return lexicon
Пример #15
0
    def setUp(self):
        """Create the wrapped ZCTextIndex under test and its expected XML.

        The index is stored as ``self._obj`` and the module-level
        ``_ZCTEXT_XML`` as ``self._XML``.
        """
        from Products.ZCTextIndex.ZCTextIndex import PLexicon
        from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex

        parent = DummyCatalog()
        parent.foo_plexicon = PLexicon('foo_plexicon')

        settings = _extra()
        settings.lexicon_id = 'foo_plexicon'
        settings.index_type = 'Okapi BM25 Rank'

        bare_index = ZCTextIndex('foo_zctext', extra=settings, caller=parent)
        self._obj = bare_index.__of__(parent)
        self._XML = _ZCTEXT_XML
 def getLexicon(self):
     """Get the lexicon for this index

     Delegates to ZCTextIndex.getLexicon first.  When that call raises,
     the lexicon named by ``self.lexicon_id`` is fetched from the site's
     ``portal_catalog`` tool instead, stored in ``self._v_lexicon`` and
     returned.

     Raises TypeError if the fetched object does not provide ILexicon.
     """
     try:
         return ZCTextIndex.getLexicon(self)
     except Exception:
         # Still broad on purpose (any regular error from the base lookup
         # leads to the fallback), but unlike a bare ``except:`` it lets
         # KeyboardInterrupt and SystemExit through.
         portal_catalog = getToolByName(getSite(), 'portal_catalog')
         lexicon = getattr(portal_catalog, self.lexicon_id)
         if not ILexicon.providedBy(lexicon):
             raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                             % repr(lexicon))
         self._v_lexicon = lexicon
         return lexicon
Пример #17
0
    def testInvalidIndexTypeRaisesValueError(self):
        # With no index_factory given, an ``extra`` whose index_type is an
        # unrecognised name is expected to raise ValueError.
        holder = LexiconHolder(self.lexicon)

        class Extra(object):
            index_type = 'Some invalid index type'

        self.assertRaises(
            ValueError,
            ZCTextIndex,
            'name',
            extra=Extra,
            caller=holder,
            index_factory=None,
            lexicon_id='lexicon',
        )
 def setUp(self):
     """Create the lexicon and the ZCTextIndex used by the tests."""
     lexicon = PLexicon('lexicon', '',
                        Splitter(), CaseNormalizer(), StopWordRemover())
     self.lexicon = lexicon
     index_args = ('name', None, LexiconHolder(lexicon),
                   self.IndexFactory, 'text', 'lexicon')
     self.zc_index = ZCTextIndex(*index_args)
     self.index = self.zc_index.index
Пример #19
0
 def testMultipleAttributes(self):
     # One document indexed on 'text1,text2': queries whose words all occur
     # in the document give one result, a query with a missing word none.
     holder = LexiconHolder(self.lexicon)
     two_field_index = ZCTextIndex(
         'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon')
     two_field_index.index_object(1, Indexable2('foo bar', 'alpha omega'))
     expectations = (
         ('foo', 1),
         ('foo alpha', 1),
         ('foo alpha gamma', 0),
     )
     for terms, expected_hits in expectations:
         nbest, total = two_field_index.query(terms)
         self.assertEqual(len(nbest), expected_hits)
Пример #20
0
    def test_fixOkapiIndexes(self):
        # Corrupt the index's _totaldoclen, run the upgrade step and check
        # that calling _totaldoclen afterwards yields 0.
        zcat = ZCatalog('catalog')
        zcat.lexicon = PLexicon('lexicon')
        okapi_text_index = ZCTextIndex('test',
                                       index_factory=OkapiIndex,
                                       caller=zcat,
                                       lexicon_id='lexicon')
        zcat.addIndex('test', okapi_text_index)
        zcat.Indexes['test'].index._totaldoclen = -1000

        from plone.app.upgrade.v41.final import fixOkapiIndexes
        fixOkapiIndexes(zcat)
        repaired_length = zcat.Indexes['test'].index._totaldoclen()
        self.assertEqual(0, repaired_length)
Пример #21
0
    def _make_one(self):
        """Return a wrapped Catalog with three indexes and sample objects.

        ``self.upper`` Dummy objects are catalogued under the repr of their
        position.
        """
        from Products.ZCatalog.Catalog import Catalog
        cat = Catalog()
        cat.lexicon = PLexicon('lexicon')

        named_indexes = [
            ('att1', FieldIndex('att1')),
            ('att2', ZCTextIndex('att2', caller=cat,
                                 index_factory=OkapiIndex,
                                 lexicon_id='lexicon')),
            ('att3', KeywordIndex('att3')),
        ]
        for name, idx in named_indexes:
            cat.addIndex(name, idx)

        for position in range(self.upper):
            cat.catalogObject(Dummy(position), repr(position))
        return cat.__of__(Dummy('foo'))
Пример #22
0
 def setUp(self):
     """Set up a catalog with 'title' and 'true' indexes and seven objects.

     Every object gets ``true = True`` except the one built from 110.
     """
     catalog = self._catalog = self._makeOne()
     catalog.lexicon = PLexicon('lexicon')
     title_index = ZCTextIndex('title', caller=catalog,
                               index_factory=OkapiIndex,
                               lexicon_id='lexicon')
     catalog.addIndex('title', title_index)
     catalog.addIndex('true', FieldIndex('true'))
     catalog.addColumn('title')

     target = self._get_catalog()
     for number in (1, 2, 3, 10, 11, 110, 111):
         doc = zdummy(number)
         doc.true = (number != 110)
         target.catalogObject(doc, str(number))
Пример #23
0
 def _make_one(self):
     """Return a wrapped Catalog holding seven ZDummy objects.

     The catalog has a 'title' text index (also a column) and a 'true'
     field index; only the object built from 110 has ``true = False``.
     """
     from Products.ZCatalog.Catalog import Catalog
     cat = Catalog()
     cat.lexicon = PLexicon('lexicon')
     title_index = ZCTextIndex('title', caller=cat,
                               index_factory=OkapiIndex,
                               lexicon_id='lexicon')
     cat.addIndex('title', title_index)
     cat.addIndex('true', FieldIndex('true'))
     cat.addColumn('title')

     for number in (1, 2, 3, 10, 11, 110, 111):
         doc = ZDummy(number)
         doc.true = (number != 110)
         cat.catalogObject(doc, str(number))
     return cat.__of__(ZDummy(1))
Пример #24
0
 def testListAttributes(self):
     # The second attribute of the document is a list of strings; words
     # from the list and from the plain string attribute are both found.
     holder = LexiconHolder(self.lexicon)
     list_index = ZCTextIndex(
         'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon')
     verses = [
         'Now is the winter of our discontent',
         'Made glorious summer by this sun of York',
     ]
     list_index.index_object(1, Indexable2('Hello Tim', verses))
     expectations = (
         ('glorious', 1),
         ('York Tim', 1),
         ('Tuesday Tim York', 0),
     )
     for terms, expected_hits in expectations:
         nbest, total = list_index.query(terms)
         self.assertEqual(len(nbest), expected_hits)
Пример #25
0
 def setUp(self):
     """Fill ``self.catalogs`` with three populated, wrapped catalogs.

     Each catalog gets three field indexes and a 'title' text index, and
     ten zdummy objects whose ``big`` flag is set for values above 5.
     """
     self.catalogs = []
     field_names = ('num', 'big', 'number')
     for _round in range(3):
         catalog = self._makeOne()
         catalog.lexicon = PLexicon('lexicon')
         for name in field_names:
             catalog.addIndex(name, FieldIndex(name))
         title_index = ZCTextIndex('title', caller=catalog,
                                   index_factory=OkapiIndex,
                                   lexicon_id='lexicon')
         catalog.addIndex('title', title_index)
         wrapped = catalog.__of__(zdummy(16336))
         for number in range(10):
             doc = zdummy(number)
             doc.big = number > 5
             doc.number = True
             wrapped.catalogObject(doc, str(number))
         self.catalogs.append(wrapped)
Пример #26
0
 def _make_many(self):
     """Return ``(catalogs, mergeResults)`` with three populated catalogs.

     Each catalog gets three field indexes and a 'title' text index, and
     ten ZDummy objects whose ``big`` flag is set for values above 5.
     """
     from Products.ZCatalog.Catalog import mergeResults
     built = []
     field_names = ('num', 'big', 'number')
     for _round in range(3):
         catalog = self._make_one()
         catalog.lexicon = PLexicon('lexicon')
         for name in field_names:
             catalog.addIndex(name, FieldIndex(name))
         title_index = ZCTextIndex('title', caller=catalog,
                                   index_factory=OkapiIndex,
                                   lexicon_id='lexicon')
         catalog.addIndex('title', title_index)
         wrapped = catalog.__of__(ZDummy(16336))
         for number in range(10):
             doc = ZDummy(number)
             doc.big = number > 5
             doc.number = True
             wrapped.catalogObject(doc, str(number))
         built.append(wrapped)
     return built, mergeResults
Пример #27
0
    def __init__(self, id='Help', title=''):
        """Store id and title and build the internal search catalog.

        The catalog gets a lexicon, a 'SearchableText' text index, two
        keyword indexes and five metadata columns.
        """
        self.id = id
        self.title = title

        catalog = ZCatalog('catalog')
        self.catalog = catalog

        lexicon = PLexicon('lexicon', '',
                           HTMLWordSplitter(),
                           CaseNormalizer(),
                           StopWordRemover())
        catalog._setObject('lexicon', lexicon)

        text_index = ZCTextIndex('SearchableText',
                                 caller=catalog,
                                 index_factory=OkapiIndex,
                                 lexicon_id=lexicon.id)
        # not using catalog.addIndex because it depends on Product
        # initialization
        catalog._catalog.addIndex('SearchableText', text_index)
        for name in ('categories', 'permissions'):
            catalog._catalog.addIndex(name, KeywordIndex(name))

        for column in ('categories', 'permissions', 'title_or_id',
                       'url', 'id'):
            catalog.addColumn(column)
 def testMultipleAttributes(self):
     # Index one two-attribute document and check the number of results
     # for three queries.
     holder = LexiconHolder(self.lexicon)
     index_args = ('name', None, holder, self.IndexFactory,
                   'text1,text2', 'lexicon')
     multi_index = ZCTextIndex(*index_args)
     document = Indexable2('foo bar', 'alpha omega')
     multi_index.index_object(1, document)
     for terms, expected_hits in (('foo', 1),
                                  ('foo alpha', 1),
                                  ('foo alpha gamma', 0)):
         nbest, total = multi_index.query(terms)
         self.assertEqual(len(nbest), expected_hits)
 def testListAttributes(self):
     # Index a document whose second attribute is a list of strings and
     # check the number of results for three queries.
     holder = LexiconHolder(self.lexicon)
     index_args = ('name', None, holder, self.IndexFactory,
                   'text1,text2', 'lexicon')
     list_index = ZCTextIndex(*index_args)
     lines = ['Now is the winter of our discontent',
              'Made glorious summer by this sun of York']
     document = Indexable2('Hello Tim', lines)
     list_index.index_object(1, document)
     for terms, expected_hits in (('glorious', 1),
                                  ('York Tim', 1),
                                  ('Tuesday Tim York', 0)):
         nbest, total = list_index.query(terms)
         self.assertEqual(len(nbest), expected_hits)
Пример #30
0
class ZCIndexTestsBase:
    """Tests exercising a ZCTextIndex built on a real PLexicon.

    The class does not define ``IndexFactory`` itself; ``setUp`` reads it
    from ``self``, so it has to be supplied elsewhere (presumably by the
    concrete test classes -- confirm against the rest of the module).
    """

    def setUp(self):
        """Create a fresh lexicon and ZCTextIndex for each test."""
        self.lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                                    'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        """Assert that running ``query`` raises ParseError."""
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        """Assert that ``query`` reports ``n`` results.

        When ``n`` is non-zero, the first element of the first result must
        also be 1.
        """
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        # One document indexed on two attributes ('text1,text2').
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        # The second attribute of the document is a list of strings.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York', ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be " "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        # Re-index the same docid with each entry of the module-level
        # ``text`` sequence.  After every step, words common to all
        # versions must be found and words unique to other versions must
        # not.
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for i, version in enumerate(text):
            # use a simple splitter rather than an official one
            words = [
                w for w in re.split(r"\W+", version.lower())
                if len(w) > 1 and w not in stop
            ]
            word_seen = set()
            for w in words:
                # record each word once per version
                if w not in word_seen:
                    word_seen.add(w)
                    d.setdefault(w, []).append(i)

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)

        for i, version in enumerate(text):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0, "did not expect to find %s" % w)
Пример #31
0
class ZCIndexTestsBase:
    """Tests exercising a ZCTextIndex built on a real PLexicon.

    ``IndexFactory`` is read from ``self`` in ``setUp`` but not defined
    here, so it has to be supplied elsewhere (presumably by the concrete
    test classes -- confirm against the rest of the module).
    """

    def setUp(self):
        """Create a fresh lexicon and ZCTextIndex for each test."""
        self.lexicon = PLexicon('lexicon', '',
                                Splitter(),
                                CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name',
                                    None,
                                    caller,
                                    self.IndexFactory,
                                    'text',
                                    'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        """Assert that running ``query`` raises ParseError."""
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        """Assert that ``query`` reports ``n`` results.

        When ``n`` is non-zero, the first element of the first result must
        also be 1.
        """
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        # One document indexed on two attributes ('text1,text2').
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        # The second attribute of the document is a list of strings.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York', ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        # Re-index the same docid with each entry of the module-level
        # ``text`` sequence.  After every step, words common to all
        # versions must be found and words unique to other versions must
        # not.
        docid = 1   # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for i, version in enumerate(text):
            # use a simple splitter rather than an official one
            words = [w for w in re.split(r"\W+", version.lower())
                     if len(w) > 1 and w not in stop]
            word_seen = set()
            for w in words:
                # record each word once per version
                if w not in word_seen:
                    word_seen.add(w)
                    d.setdefault(w, []).append(i)

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)

        for i, version in enumerate(text):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0, "did not expect to find %s" % w)
Пример #32
0
class ZCIndexTestsBase(object):
    """Test cases shared by the ZCTextIndex tests.

    ``setUp`` reads ``self.IndexFactory``, which this class does not define,
    so a concrete subclass has to provide it.  The ``assert*`` methods are
    presumably inherited from a ``unittest.TestCase`` mixed in alongside
    this class.
    """

    def setUp(self):
        """Create a fresh lexicon and a ZCTextIndex over the ``text`` field."""
        pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        self.lexicon = PLexicon('lexicon', '', *pipeline)
        holder = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex(
            'name',
            extra=None,
            caller=holder,
            index_factory=self.IndexFactory,
            field_name='text',
            lexicon_id='lexicon',
        )
        self.index = self.zc_index.index

    def parserFailure(self, query):
        """Assert that querying the index with *query* raises ParseError."""
        with self.assertRaises(ParseError):
            self.zc_index.query(query)

    def parserSuccess(self, query, n):
        """Assert that *query* reports *n* results.

        When *n* is non-zero the first result must additionally refer to
        docid 1.
        """
        results, count = self.zc_index.query(query)
        self.assertEqual(count, n)
        if not n:
            return
        first_docid = results[0][0]
        self.assertEqual(first_docid, 1)

    def testMultipleAttributes(self):
        # An index over two comma-separated field names matches words coming
        # from either attribute of the document.
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name',
            extra=None,
            caller=holder,
            index_factory=self.IndexFactory,
            field_name='text1,text2',
            lexicon_id='lexicon',
        )
        index.index_object(1, Indexable2('foo bar', 'alpha omega'))

        expectations = (
            ('foo', 1),
            ('foo alpha', 1),
            ('foo alpha gamma', 0),
        )
        for query, expected in expectations:
            hits, _total = index.query(query)
            self.assertEqual(len(hits), expected)

    def testListAttributes(self):
        # One of the two indexed attributes holds a list of strings rather
        # than a single string.
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name',
            extra=None,
            caller=holder,
            index_factory=self.IndexFactory,
            field_name='text1,text2',
            lexicon_id='lexicon',
        )
        lines = ['Now is the winter of our discontent',
                 'Made glorious summer by this sun of York']
        index.index_object(1, Indexable2('Hello Tim', lines))

        expectations = (
            ('glorious', 1),
            ('York Tim', 1),
            ('Tuesday Tim York', 0),
        )
        for query, expected in expectations:
            hits, _total = index.query(query)
            self.assertEqual(len(hits), expected)

    def testReindex(self):
        # Indexing the same docid again replaces what was indexed before.
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name',
            extra=None,
            caller=holder,
            index_factory=self.IndexFactory,
            field_name='text',
            lexicon_id='lexicon',
        )
        document = Indexable('Hello Tim')

        def expect_hits(term, expected):
            hits, _total = index.query(term)
            self.assertEqual(len(hits), expected)

        index.index_object(1, document)
        expect_hits('glorious', 0)
        expect_hits('Tim', 1)

        # reindex with another value
        document.text = 'Goodbye George'
        index.index_object(1, document)
        expect_hits('Tim', 0)
        expect_hits('Goodbye', 1)

        # reindex with an empty value
        document.text = ''
        index.index_object(1, document)
        expect_hits('George', 0)

    def testStopWords(self):
        # the only non-stopword is question
        phrase = 'to be or not to be that is the question'
        self.zc_index.index_object(1, Indexable(phrase))

        for word in phrase.split():
            if word == 'question':
                continue
            self.assertEqual(self.lexicon.termToWordIds(word), [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        accepted = (
            ('question', 1),
            ('question AND to AND be', 1),
            ('to AND question AND be', 1),
            ('question AND NOT gardenia', 1),
            ('question AND gardenia', 0),
            ('gardenia', 0),
            ('question OR gardenia', 1),
            ('question AND NOT to AND NOT be', 1),
            ('question OR to OR be', 1),
            ('question to be', 1),
        )
        for query, expected in accepted:
            self.parserSuccess(query, expected)

        rejected = (
            'to be',
            'to AND be',
            'to OR be',
            'to AND NOT be',
            'to AND NOT question',
            'to AND NOT gardenia',
        )
        for query in rejected:
            self.parserFailure(query)

    def testDocUpdate(self):
        # Index every entry of the module-level ``text`` sequence under one
        # docid, checking after each pass which words are (not) findable.
        docid = 1   # doesn't change -- we index the same doc repeatedly
        version_count = len(text)
        stopwords = get_stopdict()

        # word -> list of version numbers containing that word
        versions_by_word = {}
        for number, version in enumerate(text):
            # use a simple splitter rather than an official one
            seen = set()
            for token in re.split(r'\W+', version.lower()):
                if len(token) <= 1 or token in stopwords or token in seen:
                    continue
                seen.add(token)
                versions_by_word.setdefault(token, []).append(number)

        unique_words = {}  # version number -> words unique to that version
        shared_words = []  # words common to all versions
        for word, numbers in versions_by_word.items():
            occurrences = len(numbers)
            if occurrences == 1:
                unique_words.setdefault(numbers[0], []).append(word)
            elif occurrences == version_count:
                shared_words.append(word)
        self.assertGreater(len(shared_words), 0)
        self.assertGreater(len(unique_words), 0)

        for number, version in enumerate(text):
            self.zc_index.index_object(docid, Indexable(version))
            for word in shared_words:
                _nbest, total = self.zc_index.query(word)
                self.assertEqual(total, 1, 'did not find {0}'.format(word))
            for owner, words in unique_words.items():
                if owner == number:
                    # words of the version just indexed may match
                    continue
                for word in words:
                    _nbest, total = self.zc_index.query(word)
                    message = 'did not expect to find {0}'.format(word)
                    self.assertEqual(total, 0, message)

    def testLexiconIsNotFoundRaisesLookupError(self):
        # No lexicon_id is passed, so construction is expected to fail.
        holder = LexiconHolder(self.lexicon)
        with self.assertRaises(LookupError):
            ZCTextIndex('name', extra=None, caller=holder)

    def testInvalidIndexTypeRaisesValueError(self):
        # The ``extra`` object names an index type that is expected to be
        # rejected; no explicit index factory is given.
        class Extra(object):
            index_type = 'Some invalid index type'

        holder = LexiconHolder(self.lexicon)
        with self.assertRaises(ValueError):
            ZCTextIndex('name', extra=Extra, caller=holder,
                        index_factory=None, lexicon_id='lexicon')
Пример #33
0
 def testReindex(self):
     # Indexing the same docid again replaces what was indexed before.
     holder = LexiconHolder(self.lexicon)
     index = ZCTextIndex('name',
                         extra=None,
                         caller=holder,
                         index_factory=self.IndexFactory,
                         field_name='text',
                         lexicon_id='lexicon')
     document = Indexable('Hello Tim')

     def expect_hits(term, expected):
         # Compare the number of best matches for *term* with *expected*.
         hits, _total = index.query(term)
         self.assertEqual(len(hits), expected)

     index.index_object(1, document)
     expect_hits('glorious', 0)
     expect_hits('Tim', 1)

     # reindex with another value
     document.text = 'Goodbye George'
     index.index_object(1, document)
     expect_hits('Tim', 0)
     expect_hits('Goodbye', 1)

     # reindex with an empty value
     document.text = ''
     index.index_object(1, document)
     expect_hits('George', 0)