def testThreeElementPipeline(self):
    """All three pipeline stages run in order during lookup.

    The stop-word stage drops 'and', the stupid stage maps 'dogs' to
    'fish', and the reverse stage flips the spelling -- so the query
    term 'hsif' must resolve to the wid originally assigned to 'dogs'.
    """
    lexicon = Lexicon(
        Splitter(),
        StopWordPipelineElement({'and': 1}),
        StupidPipelineElement('dogs', 'fish'),
        WackyReversePipelineElement('fish'),
    )
    lexicon.sourceToWordIds('cats and dogs')
    wids = lexicon.termToWordIds('hsif')
    self.assertEqual(wids, [2])
def testTwoElementPipeline(self):
    """Two chained pipeline elements both transform the query term.

    'cats' -> 'fish' (stupid stage) -> reversed (wacky stage), so the
    reversed spelling 'hsif' must land on the wid of 'cats'.
    """
    lexicon = Lexicon(
        Splitter(),
        StupidPipelineElement('cats', 'fish'),
        WackyReversePipelineElement('fish'),
    )
    lexicon.sourceToWordIds('cats and dogs')
    wids = lexicon.termToWordIds('hsif')
    self.assertEqual(wids, [1])
def testTermToWordIdsWithProcess_post_glob(self):
    """A splitter's optional process_post_glob hook is invoked on lookup."""
    class AddedSplitter(Splitter):
        # Hook must receive the already-split term list unchanged.
        def process_post_glob(self, lst):
            assert lst == ['dogs']
            return ['dogs']

    lexicon = Lexicon(AddedSplitter())
    lexicon.sourceToWordIds('cats and dogs')
    wids = lexicon.termToWordIds('dogs')
    self.assertEqual(wids, [3])
class IndexTest(TestCase):
    """Base tests for a full-text index.

    Subclasses must set an ``IndexFactory`` attribute: a callable taking
    a lexicon and returning the index under test.

    Fixes applied: ``self.assert_`` (long-deprecated alias of
    ``assertTrue``) and ``dict.has_key`` (removed in Python 3) replaced
    with modern equivalents; the builtin name ``map`` is no longer
    shadowed; the unused local ``repititive_wid`` was dropped.
    """

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, DOCID=1):
        # Index a five-word document and verify every internal structure.
        doc = "simple document contains five words"
        self.assertFalse(self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index.has_doc(DOCID))
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertIn(DOCID, docmap)

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        # Unindexing the only document must leave everything empty.
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo), self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, docmap in self.index._wordinfo.items():
            if wid == document_wid:
                # "document" is the only word shared by both documents.
                self.assertEqual(len(docmap), 2)
                self.assertIn(1, docmap)
                self.assertIn(DOCID, docmap)
            else:
                self.assertEqual(len(docmap), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertIn(DOCID, docmap)

    def test_index_duplicated_words(self, DOCID=1):
        # 7 word positions but only 5 distinct words.
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertIn(DOCID, docmap)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        # Doc 2 has the same words but not as a contiguous phrase.
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
def testMissingTermToWordIds(self):
    """A term never seen by the lexicon maps to the sentinel wid 0."""
    lexicon = Lexicon(Splitter())
    lexicon.sourceToWordIds('cats and dogs')
    self.assertEqual(lexicon.termToWordIds('boxes'), [0])
def testTermToWordIds(self):
    """A known term resolves to the wid assigned when it was indexed."""
    lexicon = Lexicon(Splitter())
    lexicon.sourceToWordIds('cats and dogs')
    self.assertEqual(lexicon.termToWordIds('dogs'), [3])
def testSplitterAdaptorNofold(self):
    """Without case folding, lowercase 'cats' misses the indexed 'CATS'.

    The unknown term comes back as wid 0; 'and' and 'dogs' still match.
    """
    lexicon = Lexicon(Splitter())
    lexicon.sourceToWordIds('CATS and dogs')
    self.assertEqual(lexicon.termToWordIds('cats and dogs'), [0, 2, 3])
def testOnePipelineElement(self):
    """A single pipeline element rewrites terms before wid lookup.

    'fish' is mapped back through the 'dogs' -> 'fish' substitution, so
    querying 'fish' yields the wid originally assigned to 'dogs'.
    """
    lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
    lexicon.sourceToWordIds('cats and dogs')
    self.assertEqual(lexicon.termToWordIds('fish'), [3])
class IndexTest(object):
    # Mixin: subclasses must set ``IndexFactory`` to the appropriate
    # index constructor and also inherit the unittest assert methods.
    IndexFactory = None

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, docid=1):
        """Index one five-word document; verify all bookkeeping."""
        index = self.index
        self.assertFalse(index.has_doc(docid))
        index.index_doc(docid, 'simple document contains five words')
        self.assertTrue(index.has_doc(docid))
        self.assertTrue(index._docweight[docid])
        self.assertEqual(len(index._docweight), 1)
        self.assertEqual(len(index._docweight), index.document_count())
        self.assertEqual(len(index._wordinfo), 5)
        self.assertEqual(len(index._docwords), 1)
        self.assertEqual(len(index.get_words(docid)), 5)
        self.assertEqual(len(index._wordinfo), index.length())
        for posting in index._wordinfo.values():
            self.assertEqual(len(posting), 1)
            self.assertIn(docid, posting)

    def test_unindex_document(self):
        """Removing the only document leaves every structure empty."""
        docid = 1
        self.test_index_document(docid)
        self.index.unindex_doc(docid)
        index = self.index
        self.assertEqual(len(index._docweight), 0)
        self.assertEqual(len(index._docweight), index.document_count())
        self.assertEqual(len(index._wordinfo), 0)
        self.assertEqual(len(index._docwords), 0)
        self.assertEqual(len(index._wordinfo), index.length())

    def test_index_two_documents(self):
        """Only the shared word 'document' appears in both postings."""
        self.test_index_document()
        docid = 2
        index = self.index
        index.index_doc(docid, 'another document just four')
        self.assertTrue(index._docweight[docid])
        self.assertEqual(len(index._docweight), 2)
        self.assertEqual(len(index._docweight), index.document_count())
        self.assertEqual(len(index._wordinfo), 8)
        self.assertEqual(len(index._docwords), 2)
        self.assertEqual(len(index.get_words(docid)), 4)
        self.assertEqual(len(index._wordinfo), index.length())
        wids = self.lexicon.termToWordIds('document')
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, posting in index._wordinfo.items():
            if wid != document_wid:
                self.assertEqual(len(posting), 1)
            else:
                self.assertEqual(len(posting), 2)
                self.assertIn(1, posting)
                self.assertIn(docid, posting)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        docid = 2
        index = self.index
        self.assertEqual(len(index._docweight), 1)
        self.assertEqual(len(index._docweight), index.document_count())
        self.assertTrue(index._docweight[docid])
        self.assertEqual(len(index._wordinfo), 4)
        self.assertEqual(len(index._docwords), 1)
        self.assertEqual(len(index.get_words(docid)), 4)
        self.assertEqual(len(index._wordinfo), index.length())
        for posting in index._wordinfo.values():
            self.assertEqual(len(posting), 1)
            self.assertIn(docid, posting)

    def test_index_duplicated_words(self, docid=1):
        """Seven word positions collapse to five distinct words."""
        index = self.index
        index.index_doc(docid, 'very simple repeat repeat repeat document test')
        self.assertTrue(index._docweight[docid])
        self.assertEqual(len(index._wordinfo), 5)
        self.assertEqual(len(index._docwords), 1)
        self.assertEqual(len(index.get_words(docid)), 7)
        self.assertEqual(len(index._wordinfo), index.length())
        self.assertEqual(len(index._docweight), index.document_count())
        wids = self.lexicon.termToWordIds('repeat')
        self.assertEqual(len(wids), 1)
        for posting in index._wordinfo.values():
            self.assertEqual(len(posting), 1)
            self.assertIn(docid, posting)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        hits = self.index.search('document')
        self.assertEqual(list(hits.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        hits = self.index.search('frobnicate')
        self.assertEqual(list(hits.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        hits = self.index.search('document')
        self.assertEqual(list(hits.keys()), [1])

    def test_search_phrase(self):
        # Doc 2 contains the same words but not as a contiguous phrase.
        self.index.index_doc(1, 'the quick brown fox jumps over the lazy dog')
        self.index.index_doc(2, 'the quick fox jumps lazy over the brown dog')
        hits = self.index.search_phrase('quick brown fox')
        self.assertEqual(list(hits.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, 'how now brown cow')
        self.index.index_doc(2, 'hough nough browne cough')
        self.index.index_doc(3, 'bar brawl')
        hits = self.index.search_glob('bro*')
        self.assertEqual(list(hits.keys()), [1, 2])
        hits = self.index.search_glob('b*')
        self.assertEqual(list(hits.keys()), [1, 2, 3])
class IndexTest(TestCase):
    """Base tests for a full-text index, including document_count checks.

    Subclasses must set an ``IndexFactory`` attribute: a callable taking
    a lexicon and returning the index under test.

    Fixes applied: ``self.assert_`` (long-deprecated alias of
    ``assertTrue``) and ``dict.has_key`` (removed in Python 3) replaced
    with modern equivalents; the builtin name ``map`` is no longer
    shadowed; the unused local ``repititive_wid`` was dropped.
    """

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, DOCID=1):
        # Index a five-word document and verify every internal structure.
        doc = "simple document contains five words"
        self.assertFalse(self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index.has_doc(DOCID))
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertIn(DOCID, docmap)

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        # Unindexing the only document must leave everything empty.
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo), self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, docmap in self.index._wordinfo.items():
            if wid == document_wid:
                # "document" is the only word shared by both documents.
                self.assertEqual(len(docmap), 2)
                self.assertIn(1, docmap)
                self.assertIn(DOCID, docmap)
            else:
                self.assertEqual(len(docmap), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertIn(DOCID, docmap)

    def test_index_duplicated_words(self, DOCID=1):
        # 7 word positions but only 5 distinct words.
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assertIn(DOCID, docmap)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        # Doc 2 has the same words but not as a contiguous phrase.
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])