Exemplo n.º 1
0
class TextIndex(Persistent):
    """Persistent text index built from a ``Lexicon`` and an ``OkapiIndex``.

    The lexicon is constructed with a ``Splitter``, a ``CaseNormalizer`` and
    a ``StopWordRemover``; the Okapi index is created on top of that lexicon.
    """

    def __init__(self):
        """Create the lexicon and the Okapi index that uses it."""
        pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        self.lexicon = Lexicon(*pipeline)
        self.index = OkapiIndex(self.lexicon)

    def index_text(self, docid, text):
        """Index ``text`` under ``docid`` by delegating to the Okapi index."""
        self.index.index_doc(docid, text)
        # XXX (marker kept from the original): flags this object as changed;
        # presumably for the persistence layer -- confirm it is required.
        self._p_changed = 1

    def unindex(self, docid):
        """Remove ``docid`` by delegating to the Okapi index."""
        self.index.unindex_doc(docid)
        # XXX (marker kept from the original): see index_text().
        self._p_changed = 1

    def query(self, query, nbest=10):
        """Run ``query`` and return ``(best, total)``.

        ``best`` is whatever ``NBest(nbest).getbest()`` yields after being
        fed the ``(key, value)`` items of the query result; ``total`` is the
        length of the full result.  When executing the query yields
        ``None``, ``([], 0)`` is returned.
        """
        tree = QueryParser(self.lexicon).parseQuery(query)
        hits = tree.executeQuery(self.index)
        if hits is None:
            return [], 0
        ranker = NBest(nbest)
        ranker.addmany(hits.items())
        return ranker.getbest(), len(hits)

    def query_weight(self, query):
        """Return the index's ``query_weight`` for the terms of ``query``."""
        tree = QueryParser(self.lexicon).parseQuery(query)
        return self.index.query_weight(tree.terms())
Exemplo n.º 2
0
class TextIndex(Persistent):
    """Text index that stores a lexicon and an Okapi index as attributes."""

    def __init__(self):
        """Set up ``self.lexicon`` and ``self.index``."""
        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
        self.lexicon = lexicon
        self.index = OkapiIndex(self.lexicon)

    def _parse(self, query):
        """Parse ``query`` against this index's lexicon; return the tree."""
        return QueryParser(self.lexicon).parseQuery(query)

    def index_text(self, docid, text):
        """Hand ``text`` to the underlying index under ``docid``."""
        self.index.index_doc(docid, text)
        self._p_changed = 1  # XXX original marker; marks this object changed

    def unindex(self, docid):
        """Drop ``docid`` from the underlying index."""
        self.index.unindex_doc(docid)
        self._p_changed = 1  # XXX original marker; marks this object changed

    def query(self, query, nbest=10):
        """Execute ``query``; return a ``(best_hits, hit_count)`` pair.

        The first element comes from an ``NBest(nbest)`` chooser loaded with
        the result's items; the second is ``len()`` of the whole result.
        A ``None`` result from the query tree is reported as ``([], 0)``.
        """
        matches = self._parse(query).executeQuery(self.index)
        if matches is None:
            return [], 0
        top = NBest(nbest)
        top.addmany(matches.items())
        best_hits, hit_count = top.getbest(), len(matches)
        return best_hits, hit_count

    def query_weight(self, query):
        """Pass the parsed query's terms to the index's ``query_weight``."""
        query_terms = self._parse(query).terms()
        return self.index.query_weight(query_terms)
Exemplo n.º 3
0
class TestUpgrade(TestCase):
    """Exercise ``OkapiIndex`` instances put into an older attribute state.

    Each test builds an index, alters ``_totaldoclen`` or ``document_count``
    to simulate an older instance, and checks that searching or further
    (un)indexing still works and agrees with an untouched index.
    """

    @staticmethod
    def _downgrade_totaldoclen(index):
        """Replace the callable ``_totaldoclen`` with its current int value."""
        index._totaldoclen = int(index._totaldoclen())

    def test_query_before_totaldoclen_upgrade(self):
        legacy = self.index1 = OkapiIndex(Lexicon(Splitter()))
        legacy.index_doc(0, 'The quiet of night')
        # Simulate an older instance whose _totaldoclen is a plain integer.
        self._downgrade_totaldoclen(legacy)
        hits = legacy.search('night')
        self.assertEqual(len(hits), 1)

    def test_upgrade_totaldoclen(self):
        legacy = self.index1 = OkapiIndex(Lexicon())
        fresh = self.index2 = OkapiIndex(Lexicon())
        both = (legacy, fresh)
        for idx in both:
            idx.index_doc(0, 'The quiet of night')
        # Simulate an older instance whose _totaldoclen is a plain integer.
        self._downgrade_totaldoclen(legacy)
        for idx in both:
            idx.index_doc(1, 'gazes upon my shadow')
        self.assertEqual(legacy._totaldoclen(), fresh._totaldoclen())
        # Downgrade again, this time before removing a document.
        self._downgrade_totaldoclen(legacy)
        for idx in both:
            idx.unindex_doc(0)
        self.assertEqual(legacy._totaldoclen(), fresh._totaldoclen())

    def test_query_before_document_count_upgrade(self):
        legacy = self.index1 = OkapiIndex(Lexicon(Splitter()))
        legacy.index_doc(0, 'The quiet of night')
        # Remove document_count to simulate an older index instance.
        del legacy.document_count
        hits = legacy.search('night')
        self.assertEqual(len(hits), 1)

    def test_upgrade_document_count(self):
        legacy = self.index1 = OkapiIndex(Lexicon())
        fresh = self.index2 = OkapiIndex(Lexicon())
        both = (legacy, fresh)
        for idx in both:
            idx.index_doc(0, 'The quiet of night')
        # Remove document_count to simulate an older index instance.
        del legacy.document_count
        for idx in both:
            idx.index_doc(1, 'gazes upon my shadow')
        self.assertIs(legacy.document_count.__class__, Length)
        self.assertEqual(legacy.document_count(), fresh.document_count())
        # Remove it again, this time before removing a document.
        del legacy.document_count
        for idx in both:
            idx.unindex_doc(0)
        self.assertIs(legacy.document_count.__class__, Length)
        self.assertEqual(legacy.document_count(), fresh.document_count())