Example #1: testThreeElementPipeline
def testThreeElementPipeline(self):
    # Pipeline: drop the stop word "and", map "dogs" -> "fish",
    # then reverse "fish" -> "hsif" before the term reaches the lexicon.
    lexicon = Lexicon(
        Splitter(),
        StopWordPipelineElement({"and": 1}),
        StupidPipelineElement("dogs", "fish"),
        WackyReversePipelineElement("fish"),
    )
    wids = lexicon.sourceToWordIds("cats and dogs")
    # The source is stored as ["cats", "hsif"], so the reversed query
    # term resolves to the second word id.
    wids = lexicon.termToWordIds("hsif")
    self.assertEqual(wids, [2])
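The three pipeline elements above are test helpers rather than published components. A minimal sketch of what they might look like, assuming each element only needs a process(seq) method that maps a list of terms to a new list; the class bodies below are inferred from the test's expectations, not copied from any real API:

class StopWordPipelineElement:
    # Drop any term found in the stop-word dictionary.
    def __init__(self, stopdict=None):
        self._stopdict = stopdict or {}

    def process(self, terms):
        return [t for t in terms if not self._stopdict.get(t)]


class StupidPipelineElement:
    # Replace one fixed word with another.
    def __init__(self, fromword, toword):
        self._fromword = fromword
        self._toword = toword

    def process(self, terms):
        return [self._toword if t == self._fromword else t for t in terms]


class WackyReversePipelineElement:
    # Reverse the spelling of one fixed word.
    def __init__(self, revword):
        self._revword = revword

    def process(self, terms):
        return [t[::-1] if t == self._revword else t for t in terms]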
Example #2: IndexTest
class IndexTest(TestCase):
    # Abstract base: concrete subclasses must supply IndexFactory,
    # a callable that builds an index around the shared Lexicon.

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def _test_index_document_assertions(self, DOCID=1):
        # After index_doc, the index should report exactly one document
        # and five distinct words, each posted against DOCID alone.
        self.assertEqual(self.index.documentCount(), 1)
        self.assertEqual(self.index.wordCount(), 5)
        self.assertEqual(self.lexicon.wordCount(), 5)
        self.assert_(self.index.has_doc(DOCID))
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assert_(not self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self._test_index_document_assertions(DOCID)

    def test_unindex_document_absent_docid(self):
        self.test_index_document(1)
        self.index.unindex_doc(2)
        self._test_index_document_assertions(1)

    def test_clear(self):
        self.test_index_document(1)
        self.index.clear()
        self._test_unindex_document_assertions()

    def _test_unindex_document_assertions(self):
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self._test_unindex_document_assertions()

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(map), 2)
                self.assert_(map.has_key(1))
                self.assert_(map.has_key(DOCID))
            else:
                self.assertEqual(len(map), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.wordCount())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        repetitive_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
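IndexTest is an abstract base: setUp builds the index from self.IndexFactory, which the class never defines, so the tests only run through concrete subclasses. A minimal sketch of how it might be wired up, assuming Lexicon, Splitter, and the two ranking indexes come from zope.index.text; the import paths and the OkapiIndex/CosineIndex choices are assumptions, not shown in the snippet above:

import unittest

# Assumed import locations; adjust to the package that actually
# provides these classes.
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.okapiindex import OkapiIndex
from zope.index.text.cosineindex import CosineIndex


class OkapiIndexTest(IndexTest):
    # Okapi BM25-style ranking.
    IndexFactory = OkapiIndex


class CosineIndexTest(IndexTest):
    # Cosine-similarity (vector space) ranking.
    IndexFactory = CosineIndex


if __name__ == "__main__":
    unittest.main()

Note that a plain unittest.main() would also collect the abstract IndexTest itself, which fails in setUp without an IndexFactory; in practice the suite is built from the concrete subclasses only.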
Example #3: testSplitterAdaptorNofold
def testSplitterAdaptorNofold(self):
    # No case-folding element in the pipeline: "CATS" is stored verbatim,
    # so the lower-case query term "cats" is unknown and maps to wid 0.
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("CATS and dogs")
    wids = lexicon.termToWordIds("cats and dogs")
    self.assertEqual(wids, [0, 2, 3])
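For contrast with the no-fold behavior above, a case-folding variant is sketched below. It assumes a CaseNormalizer pipeline element is available alongside Splitter (as in ZCTextIndex-style lexicons); with it, the upper-case source word and the lower-case query term share a word id:

def testSplitterAdaptorFold(self):
    # CaseNormalizer lower-cases terms at both index and query time,
    # so "CATS" and "cats" map to the same word id.
    lexicon = Lexicon(Splitter(), CaseNormalizer())
    wids = lexicon.sourceToWordIds("CATS and dogs")
    wids = lexicon.termToWordIds("cats and dogs")
    self.assertEqual(wids, [1, 2, 3])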
Example #4: testOnePipelineElement
def testOnePipelineElement(self):
    # A single substitution element: "dogs" is stored as "fish",
    # so querying "fish" finds the third word id.
    lexicon = Lexicon(Splitter(), StupidPipelineElement("dogs", "fish"))
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("fish")
    self.assertEqual(wids, [3])
Example #5: testMissingTermToWordIds
def testMissingTermToWordIds(self):
    # A term that was never indexed comes back as the sentinel wid 0.
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("boxes")
    self.assertEqual(wids, [0])
Example #6: testTermToWordIds
def testTermToWordIds(self):
    # Straight lookup: "dogs" is the third distinct word indexed,
    # so it gets word id 3.
    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds("cats and dogs")
    wids = lexicon.termToWordIds("dogs")
    self.assertEqual(wids, [3])
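Outside a unittest class, the same Lexicon calls can be exercised directly. A minimal standalone sketch, assuming Lexicon and Splitter live in zope.index.text.lexicon (an assumed import path; adjust to whichever package actually provides them):

from zope.index.text.lexicon import Lexicon, Splitter

lexicon = Lexicon(Splitter())

# Indexing assigns word ids 1, 2, 3 to the distinct source words.
print(lexicon.sourceToWordIds("cats and dogs"))  # [1, 2, 3]

# Query-time lookups reuse the stored ids; unknown terms map to 0.
print(lexicon.termToWordIds("dogs"))    # [3]
print(lexicon.termToWordIds("boxes"))   # [0]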