示例#1
0
def sample_sim_index():
    """Walk through the MemorySimIndex API: build, query, save and reload.

    Indexes four university homepages by URL, prints a postings list and a
    multi-term document lookup, runs the same similarity query under the
    'simple_count' and 'tfidf' scorers, then writes the index to
    ``myindex.idx`` in the current working directory and reads it back into
    a second index object.

    All results are printed to stdout; nothing is returned.
    """
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/',
                         'http://www.berkeley.edu',
                         'http://www.ucla.edu',
                         'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # The file is opened in binary mode: save()/load() exchange raw bytes
    # (they are exercised against io.BytesIO), and a text-mode handle rejects
    # bytes on Python 3.
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))
示例#2
0
def sample_sim_index():
    """Demonstrate an end-to-end MemorySimIndex session.

    Steps, each announced on stdout:

    1. Build an in-memory index from four university homepage URLs.
    2. Print the postings list for 'university' and the documents that
       contain both 'university' and 'california'.
    3. Run the query "stanford university" with the 'simple_count' scorer
       and again with the 'tfidf' scorer.
    4. Save the index to ``myindex.idx`` (current working directory), load
       it into a new object, and repeat the multi-term lookup on the copy.

    Returns nothing.
    """
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu',
                         'http://www.ucla.edu', 'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # Binary mode is required: the serialized index is a byte stream, which
    # a text-mode file object refuses to write or read on Python 3.
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print(
        "Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))
示例#3
0
class MemorySimIndexTest(SimIndexTest, unittest.TestCase):
    """Run the shared SimIndex test suite against MemorySimIndex.

    Tests that exercise the generic SimIndex interface live in the parent
    class, SimIndexTest, so the same test code can be reused for every
    SimIndex implementation.  Only tests for APIs that are not covered by
    the parent class are defined here.
    """

    def setUp(self):
        # Provide the index under test first, then hand over to the parent
        # class's setUp.
        print("MemorySimIndexTest")
        self.sim_index = MemorySimIndex()
        super(MemorySimIndexTest, self).setUp()

    def tearDown(self):
        """Intentionally a no-op: no per-test cleanup is performed here."""

    def test_save_load(self):
        """Test save()/load() functionality

        The index is saved to an in-memory byte buffer, loaded back, and
        swapped in as the index under test before re-running the
        simple-scorer query test against the reloaded copy.
        """
        with io.BytesIO() as buffer:
            self.sim_index.save(buffer)
            buffer.seek(0)  # rewind so load() reads from the beginning
            restored_index = MemorySimIndex.load(buffer)
        self.sim_index = restored_index
        # The query test must still pass on the reloaded index.
        self.test_query_simple_scorer()