Пример #1
0
def sample_sim_index():
    """Walk through a MemorySimIndex demo: index, query, save, reload.

    The demo indexes four university homepage URLs, prints the postings
    list for 'university' and the documents containing both 'university'
    and 'california', runs the query "stanford university" under the
    'simple_count' and 'tfidf' scorers, then saves the index to
    "myindex.idx" (a relative path) and reloads it to repeat the term
    lookup on the loaded copy.

    Takes no arguments and returns nothing; all results are printed.
    """
    def announce(heading):
        # Every section heading is preceded by a bare print() call.
        print()
        print(heading)

    homepage_urls = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )
    lookup_terms = ('university', 'california')
    query_text = "stanford university"
    index_path = "myindex.idx"

    # Create an in-memory index and inspect it.
    announce("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls(*homepage_urls)

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms(*lookup_terms)))

    # Run the same similarity query under each scorer, in order.
    scorer_runs = (
        ("Similarity search for query 'stanford university' (simple scorer)",
         'simple_count'),
        ("Similarity search for query 'stanford university' (tf.idf scorer)",
         'tfidf'),
    )
    for heading, scorer_name in scorer_runs:
        announce(heading)
        sim_index.set_query_scorer(scorer_name)
        pprint(list(sim_index.query(query_text)))

    # Round-trip the index through a file on disk.
    announce("Saving index to disk")
    with open(index_path, "w") as index_file:
        sim_index.save(index_file)

    announce("Loading index from disk")
    with open(index_path, "r") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    announce("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms(*lookup_terms)))
Пример #2
0
def sample_sim_index():
    """Demonstrate building, querying and persisting a MemorySimIndex.

    Steps, each reported on standard output:

    1. Index four university homepage URLs with ``index_urls``.
    2. Print the postings list for 'university' and the document names
       containing both 'university' and 'california'.
    3. Query "stanford university" with the 'simple_count' scorer, then
       again with the 'tfidf' scorer.
    4. Save the index to "myindex.idx", load it back with
       ``MemorySimIndex.load`` and repeat the two-term lookup on the
       loaded index.

    Returns None.
    """
    saved_index_name = "myindex.idx"
    both_terms = ['university', 'california']

    # Step 1: build the index.
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls(
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )

    # Step 2: direct lookups against the index.
    print("Postings list for 'university':")
    postings = sim_index.postings_list('university')
    pprint(postings)
    print("Pages containing terms 'university' and 'california'")
    matching_docs = list(sim_index.docnames_with_terms(*both_terms))
    pprint(matching_docs)

    # Step 3: similarity queries, one pass per scorer.
    scorer_headings = [
        ('simple_count',
         "Similarity search for query 'stanford university' (simple scorer)"),
        ('tfidf',
         "Similarity search for query 'stanford university' (tf.idf scorer)"),
    ]
    for scorer_name, heading in scorer_headings:
        print()
        print(heading)
        sim_index.set_query_scorer(scorer_name)
        hits = list(sim_index.query("stanford university"))
        pprint(hits)

    # Step 4: save to disk, then load into a second index object.
    print()
    print("Saving index to disk")
    with open(saved_index_name, "w") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open(saved_index_name, "r") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    reloaded_docs = list(sim_index2.docnames_with_terms(*both_terms))
    pprint(reloaded_docs)