def test_config(self):
    '''Check that the 'lowercase' config param is honoured by lookups.'''

    # Two documents whose text differs only in letter case.
    corpus = (('doc1', 'Hello There'), ('doc2', 'hello there'))

    # Each entry: (value given to 'lowercase', expected (term, docnames)).
    cases = (
        (True, (('hello', {'doc1', 'doc2'}),
                ('Hello', {'doc1', 'doc2'}),
                ('HELLO', {'doc1', 'doc2'}))),
        (False, (('hello', {'doc2'}),
                 ('Hello', {'doc1'}),
                 ('HELLO', set()))),
    )

    for lowercase, expected in cases:
        # Build a brand-new index for every setting.
        index = MemorySimIndex()
        index.set_config('lowercase', lowercase)
        index.index_string_buffers(corpus)

        for term, wanted_docs in expected:
            # Term lookup must return exactly the expected documents ...
            by_term = set(index.docnames_with_terms(term))
            self.assertEqual(by_term, wanted_docs)
            # ... and so must the (docname, score) pairs from a query.
            by_query = {docname for docname, _score in index.query(term)}
            self.assertEqual(by_query, wanted_docs)
def sample_sim_index_collection():
    '''Demo: build a two-shard SimIndexCollection, index URLs, query it.

    The collection is backed by two MemorySimIndex shards, uses the
    'tfidf' query scorer, and the query result is pretty-printed.
    '''
    urls = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )

    print()
    print(
        "SimIndexCollection: build a collection, index some urls, and query it"
    )

    # Shards are created first, then handed to the collection in one call.
    shards = tuple(MemorySimIndex() for _ in range(2))
    collection = SimIndexCollection()
    collection.add_shards(*shards)
    collection.set_query_scorer('tfidf')

    collection.index_urls(*urls)
    results = collection.query('stanford university')
    pprint(results)
def setUp(self):
    '''Use a SimIndexCollection with two MemorySimIndex shards as fixture.'''
    print("SimIndexCollectionTest")
    self.sim_index = SimIndexCollection()
    # Add the shards one call at a time.
    for _ in range(2):
        shard = MemorySimIndex()
        self.sim_index.add_shards(shard)
    # The parent setUp runs last, after self.sim_index is in place.
    super(SimIndexCollectionTest, self).setUp()
def sample_sim_index():
    '''Demo of MemorySimIndex: index URLs, query, then save and reload.

    Indexes four URLs, prints a postings list and a two-term lookup,
    runs the same similarity query under the 'simple_count' and 'tfidf'
    scorers, then writes the index to "myindex.idx", loads it back and
    repeats the two-term lookup on the loaded copy.
    '''
    urls = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )
    lookup_terms = ('university', 'california')
    query_text = "stanford university"
    index_path = "myindex.idx"

    # Build the in-memory index and inspect it.
    print()
    print("Creating in-memory index of university homepages")
    index = MemorySimIndex()
    index.index_urls(*urls)

    print("Postings list for 'university':")
    postings = index.postings_list('university')
    pprint(postings)

    print("Pages containing terms 'university' and 'california'")
    matching_pages = list(index.docnames_with_terms(*lookup_terms))
    pprint(matching_pages)

    # Same query, two different scorers.
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    index.set_query_scorer('simple_count')
    pprint(list(index.query(query_text)))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    index.set_query_scorer('tfidf')
    pprint(list(index.query(query_text)))

    # Round-trip the index through a file on disk.
    print()
    print("Saving index to disk")
    with open(index_path, "w") as out_file:
        index.save(out_file)

    print()
    print("Loading index from disk")
    with open(index_path, "r") as in_file:
        reloaded = MemorySimIndex.load(in_file)

    print()
    print(
        "Pages containing terms 'university' and 'california' in loaded index")
    reloaded_pages = list(reloaded.docnames_with_terms(*lookup_terms))
    pprint(reloaded_pages)
def setUp(self):
    '''Use a ConcurrentSimIndex wrapping a MemorySimIndex as fixture.'''
    print("ConcurrentSimIndexTest")
    backing_index = MemorySimIndex()
    self.sim_index = ConcurrentSimIndex(backing_index)
    # The parent setUp runs last, after self.sim_index is in place.
    super(ConcurrentSimIndexTest, self).setUp()
def setUp(self):
    '''Use a plain MemorySimIndex as fixture.'''
    print("MemorySimIndexTest")
    memory_index = MemorySimIndex()
    self.sim_index = memory_index
    # The parent setUp runs last, after self.sim_index is in place.
    super(MemorySimIndexTest, self).setUp()