示例#1
0
    def test_config(self):
        '''Check that the 'lowercase' config param changes term matching.

        For each setting of 'lowercase', a fresh MemorySimIndex is
        configured, the same two documents are indexed, and the documents
        returned for several casings of "hello" are compared against the
        expected sets.
        '''

        ### Test 'lowercase' param

        # Two documents whose text differs only in letter case
        test_docs = (('doc1', 'Hello There'),
                     ('doc2', 'hello there'))

        def _check_lc(index, golden_results):
            '''Assert both lookup paths yield the expected docs per term'''
            for term, expected_docs in golden_results:
                # Path 1: direct term -> docnames lookup
                named = set(index.docnames_with_terms(term))
                self.assertEqual(named, expected_docs)
                # Path 2: scored query; only the docnames are compared
                queried = {doc for (doc, score) in index.query(term)}
                self.assertEqual(queried, expected_docs)

        # (lowercase setting, expected docs for each query term)
        scenarios = (
            (True, (('hello', {'doc1', 'doc2'}),
                    ('Hello', {'doc1', 'doc2'}),
                    ('HELLO', {'doc1', 'doc2'}))),
            (False, (('hello', {'doc2'}),
                     ('Hello', {'doc1'}),
                     ('HELLO', set()))),
        )

        for lowercase, golden_results in scenarios:
            # A fresh index per scenario so the settings never mix
            index = MemorySimIndex()
            index.set_config('lowercase', lowercase)
            index.index_string_buffers(test_docs)
            _check_lc(index, golden_results)
示例#2
0
def sample_sim_index_collection():
    """Demonstrate SimIndexCollection: shard, index some urls, and query.

    Builds a collection backed by two MemorySimIndex shards, selects the
    'tfidf' query scorer, indexes four university homepages, and
    pretty-prints the result of querying for 'stanford university'.
    """
    urls = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )

    print()
    print(
        "SimIndexCollection: build a collection, index some urls, and query it"
    )

    # Two in-memory shards registered with a single collection
    shards = (MemorySimIndex(), MemorySimIndex())
    collection = SimIndexCollection()
    collection.add_shards(*shards)
    collection.set_query_scorer('tfidf')
    collection.index_urls(*urls)

    results = collection.query('stanford university')
    pprint(results)
示例#3
0
    def setUp(self):
        '''Create a SimIndexCollection with two MemorySimIndex shards as
        the index under test, then run the parent class's setUp.'''
        print("SimIndexCollectionTest")
        collection = SimIndexCollection()
        self.sim_index = collection
        # Shards are registered one call at a time
        for _ in range(2):
            collection.add_shards(MemorySimIndex())

        super(SimIndexCollectionTest, self).setUp()
示例#4
0
def sample_sim_index():
    """Demonstrate MemorySimIndex: index urls, query, save, and reload.

    Indexes four university homepages, prints the postings list and the
    pages matching a pair of terms, runs the same similarity query under
    two scorers, then writes the index to "myindex.idx" and reads it back
    to repeat the term lookup on the loaded copy.
    """
    urls = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )
    lookup_terms = ('university', 'california')
    index_path = "myindex.idx"
    # (banner text, scorer name) for each similarity-search demo
    scorer_runs = (
        ("Similarity search for query 'stanford university' (simple scorer)",
         'simple_count'),
        ("Similarity search for query 'stanford university' (tf.idf scorer)",
         'tfidf'),
    )

    # Build the in-memory index
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls(*urls)

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms(*lookup_terms)))

    # Run the same query once per scorer
    for banner, scorer_name in scorer_runs:
        print()
        print(banner)
        sim_index.set_query_scorer(scorer_name)
        pprint(list(sim_index.query("stanford university")))

    # Round-trip the index through a file on disk
    # NOTE(review): the file is opened in text mode; confirm that
    # save()/load() do not require a binary-mode file object.
    print()
    print("Saving index to disk")
    with open(index_path, "w") as out_file:
        sim_index.save(out_file)

    print()
    print("Loading index from disk")
    with open(index_path, "r") as in_file:
        sim_index2 = MemorySimIndex.load(in_file)

    print()
    print(
        "Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms(*lookup_terms)))
示例#5
0
 def setUp(self):
     '''Wrap a fresh MemorySimIndex in a ConcurrentSimIndex as the index
     under test, then run the parent class's setUp.'''
     print("ConcurrentSimIndexTest")
     backing_index = MemorySimIndex()
     self.sim_index = ConcurrentSimIndex(backing_index)
     super(ConcurrentSimIndexTest, self).setUp()
示例#6
0
 def setUp(self):
     '''Create a fresh MemorySimIndex as the index under test, then run
     the parent class's setUp.'''
     print("MemorySimIndexTest")
     fresh_index = MemorySimIndex()
     self.sim_index = fresh_index
     super(MemorySimIndexTest, self).setUp()