示例#1
0
def sample_sim_index_collection():
    """Demo: spread an index over two in-memory shards and query it.

    Creates two ``MemorySimIndex`` shards, adds them to a
    ``SimIndexCollection``, selects the 'tfidf' query scorer, indexes four
    university homepages and pretty-prints the result of querying the
    collection for 'stanford university'.
    """
    print()
    print(
        "SimIndexCollection: build a collection, index some urls, and query it"
    )

    homepage_urls = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )

    # The shards are created before the collection that will hold them.
    shards = tuple(MemorySimIndex() for _ in range(2))
    collection = SimIndexCollection()
    collection.add_shards(*shards)
    collection.set_query_scorer('tfidf')
    collection.index_urls(*homepage_urls)

    pprint(collection.query('stanford university'))
示例#2
0
    def setUp(self):
        """Install a SimIndexCollection with two MemorySimIndex shards.

        The parent class's setUp() runs afterwards, once ``self.sim_index``
        is in place.
        """
        print("SimIndexCollectionTest")
        collection = SimIndexCollection()
        self.sim_index = collection
        # Shards are added one call at a time, two in total.
        for _ in range(2):
            collection.add_shards(MemorySimIndex())

        super(SimIndexCollectionTest, self).setUp()
示例#3
0
 def test_save_load(self):
     """Test save()/load() functionality

     Writes the index under test to an in-memory byte buffer, reads it
     back with MemorySimIndex.load(), swaps the reloaded index in as
     ``self.sim_index`` and re-runs test_query_simple_scorer() against it.
     """
     with io.BytesIO() as buf:
         self.sim_index.save(buf)
         # Rewind so load() starts reading at the beginning of the buffer.
         buf.seek(0)
         reloaded = MemorySimIndex.load(buf)

     self.sim_index = reloaded
     # The reloaded index must still pass the simple-scorer query test.
     self.test_query_simple_scorer()
示例#4
0
class MemorySimIndexTest(SimIndexTest, unittest.TestCase):
    """Run the shared SimIndex tests against MemorySimIndex.

    Every test that goes through the SimIndex interface lives in the parent
    class, SimIndexTest, so the same test code can be reused across all
    SimIndex implementations.  Only tests for APIs that the parent class
    does not cover are defined here.
    """

    def setUp(self):
        """Install a new MemorySimIndex, then run the parent setUp()."""
        print("MemorySimIndexTest")
        index = MemorySimIndex()
        self.sim_index = index
        super(MemorySimIndexTest, self).setUp()

    def tearDown(self):
        """Nothing to clean up."""

    def test_save_load(self):
        """Test save()/load() functionality

        Writes the index to an in-memory byte buffer, reads it back with
        MemorySimIndex.load(), swaps the reloaded index in as
        ``self.sim_index`` and re-runs test_query_simple_scorer() on it.
        """
        with io.BytesIO() as buf:
            self.sim_index.save(buf)
            # Rewind so load() starts reading at the beginning.
            buf.seek(0)
            reloaded = MemorySimIndex.load(buf)

        self.sim_index = reloaded
        # The reloaded index must still pass the simple-scorer query test.
        self.test_query_simple_scorer()
示例#5
0
def sample_sim_index():
    """Demo: build, query, save and reload an in-memory similarity index.

    Indexes four university homepages in a ``MemorySimIndex``, prints the
    postings list for 'university' and the documents containing both
    'university' and 'california', runs the query 'stanford university'
    under the 'simple_count' and 'tfidf' scorers, then saves the index to
    the relative path ``myindex.idx`` and loads it back into a second index.
    """
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/',
                         'http://www.berkeley.edu',
                         'http://www.ucla.edu',
                         'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # The file is opened in binary mode: the tests drive save()/load() with
    # io.BytesIO, i.e. a bytes stream, and a text-mode handle would reject
    # bytes on Python 3 (and may translate newlines in binary data).
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))
示例#6
0
 def setUp(self):
     """Install a ConcurrentSimIndex wrapping a new MemorySimIndex.

     The parent class's setUp() runs afterwards, once ``self.sim_index``
     is in place.
     """
     print("ConcurrentSimIndexTest")
     wrapped_index = MemorySimIndex()
     self.sim_index = ConcurrentSimIndex(wrapped_index)
     super(ConcurrentSimIndexTest, self).setUp()
示例#7
0
 def setUp(self):
     """Install a new MemorySimIndex, then run the parent setUp()."""
     print("MemorySimIndexTest")
     index = MemorySimIndex()
     self.sim_index = index
     super(MemorySimIndexTest, self).setUp()
示例#8
0
    def test_config(self):
        """Ensure that various config params are properly handled

        Currently covers the 'lowercase' param: the same two documents,
        which differ only in letter case, are indexed once with
        lowercase=True and once with lowercase=False.  For each query term
        both docnames_with_terms() and query() must report exactly the
        expected set of document names.
        """
        # Test data: identical text apart from capitalisation.
        docs = (('doc1', 'Hello There'),
                ('doc2', 'hello there'))

        # (lowercase setting, ((query term, expected doc names), ...))
        scenarios = (
            (True, (('hello', {'doc1', 'doc2'}),
                    ('Hello', {'doc1', 'doc2'}),
                    ('HELLO', {'doc1', 'doc2'}))),
            (False, (('hello', {'doc2'}),
                     ('Hello', {'doc1'}),
                     ('HELLO', set()))),
        )

        for lowercase, expected_by_term in scenarios:
            # Each scenario gets its own index so the settings cannot mix.
            index = MemorySimIndex()
            index.set_config('lowercase', lowercase)
            index.index_string_buffers(docs)

            for term, expected_docs in expected_by_term:
                containing = set(index.docnames_with_terms(term))
                self.assertEqual(containing, expected_docs)

                # query() yields (doc, score) pairs; only the names matter.
                returned = {name for (name, _score) in index.query(term)}
                self.assertEqual(returned, expected_docs)
示例#9
0
def sample_sim_index():
    """Demo: build, query, save and reload an in-memory similarity index.

    Indexes four university homepages in a ``MemorySimIndex``, prints the
    postings list for 'university' and the documents containing both
    'university' and 'california', runs the query 'stanford university'
    under the 'simple_count' and 'tfidf' scorers, then saves the index to
    the relative path ``myindex.idx`` and loads it back into a second index.
    """
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu',
                         'http://www.ucla.edu', 'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # The file is opened in binary mode: the tests drive save()/load() with
    # io.BytesIO, i.e. a bytes stream, and a text-mode handle would reject
    # bytes on Python 3 (and may translate newlines in binary data).
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print(
        "Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))