Example #1
def test_invIndex_equals_resInvIndex(crawler):
    print "3) Testing that the inverted index correlates with the resolved inverted index..."

    inverted_index = crawler.get_inverted_index()
    res_inverted_index = crawler.get_resolved_inverted_index()

    word_wordID_check = True
    url_docID_check = True

    for word, urls in res_inverted_index.items():
        word_id = crawler._word_id_cache[word]

        # Check that each word in the resolved inverted index corresponds
        # to a wordID present in the inverted index.
        if word_id not in inverted_index:
            print "Word:", word, "is not linked"
            word_wordID_check = False
            continue  # skip the lookup below to avoid a KeyError

        inv_indx_docIDs = inverted_index[word_id]
        for url in urls:
            docID = crawler._doc_id_cache[url]

            # Check that each URL in the resolved inverted index maps to a
            # documentID listed under the same wordID in the inverted index.
            if docID not in inv_indx_docIDs:
                print "URL:", url, "is not linked"
                url_docID_check = False

    return word_wordID_check and url_docID_check
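
For reference, here is a minimal sketch of how a resolved inverted index can be derived from the inverted index and the two caches this test reads (_word_id_cache maps word to wordID, _doc_id_cache maps URL to docID). It only illustrates the relationship being verified; the crawler's actual get_resolved_inverted_index() may be implemented differently.

def resolve_inverted_index(inverted_index, word_id_cache, doc_id_cache):
    # Sketch only: invert the two caches so IDs can be mapped back to
    # their words and URLs. Assumes the cache layouts used by the test.
    id_to_word = dict((wid, w) for w, wid in word_id_cache.items())
    id_to_url = dict((did, u) for u, did in doc_id_cache.items())

    resolved = {}
    for word_id, doc_ids in inverted_index.items():
        # word -> set of URLs, mirroring wordID -> set of docIDs.
        resolved[id_to_word[word_id]] = set(id_to_url[d] for d in doc_ids)
    return resolved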
Example #2
def test_inverted_index(crawler):
    print "1) Testing the inverted index..."

    inverted_index = crawler.get_inverted_index()

    # Check that the inverted index contains every wordID in the cache.
    wordID_check = True
    for word, word_id in crawler._word_id_cache.items():
        if word_id not in inverted_index:
            print "Word:", word, "is not in the inverted_index"
            wordID_check = False

    if not wordID_check:
        return False

    # Check that the inverted index only contains valid documentIDs.
    # Build the set once so each membership test is O(1) instead of
    # scanning the cache values on every lookup.
    known_docIDs = set(crawler._doc_id_cache.values())
    docID_check = True
    for wordID, docIDs in inverted_index.items():
        for docID in docIDs:
            if docID not in known_docIDs:
                print "DocID:", docID, "is not in the _doc_id_cache"
                docID_check = False

    return docID_check
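
A minimal sketch of a driver that runs both checks above against a freshly crawled instance, assuming the crawler constructor and crawl() signature used in Example #4 and a urls.txt seed file:

def run_index_tests():
    # Build and crawl an instance, then run both integrity checks.
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=1)

    # Evaluate both tests up front (a list, so neither is short-circuited).
    results = [test_inverted_index(bot), test_invIndex_equals_resInvIndex(bot)]
    if all(results):
        print "All index tests passed"
    else:
        print "Some index tests failed"
    return all(results)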
Example #3
def backEnd_run(dep):
    # Crawl through the URLs provided in urls.txt. `crawler` and
    # `page_rank` are expected to be available at module level.
    crawler.crawl(depth=int(dep))

    # Retrieve the data needed to populate the SQL tables.
    doc_index = crawler.get_docs_cache()
    inverted_index = crawler.get_inverted_index()
    anchor_db = crawler.get_anchor_db()
    lexicon = crawler.get_lexicon()
    pg_rank = page_rank(crawler.get_links_queue())
    titles_list = crawler.get_title_cache()
    resolved_inverted_index = crawler.get_resolved_inverted_index()
    description = crawler.get_desc_cache()
    images = crawler.get_image_cache()

    return (doc_index, titles_list, lexicon, anchor_db, pg_rank,
            inverted_index, description, images, resolved_inverted_index)
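
For reference, a hypothetical caller for backEnd_run(); the unpacking must mirror the order of the return statement above, and the depth of 1 is only an example value.

# Unpack in the same order as backEnd_run's return statement.
(doc_index, titles_list, lexicon, anchor_db, pg_rank,
 inverted_index, description, images,
 resolved_inverted_index) = backEnd_run(1)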
Example #4
import os
import sys
from crawler import crawler


print "Now starting the test...."
print

# Bind the instance to a separate name so the crawler class is not
# shadowed; reassigning `crawler` here would make the second
# construction below fail with a TypeError.
bot = crawler(None, "urls.txt")
inverted_index = bot.get_inverted_index()
print
print "inverted_index is......"
print
print inverted_index
print
print
resolved_inverted_index = bot.get_resolved_inverted_index()
print "resolved_inverted_index is......"
print
print resolved_inverted_index


bot = crawler(None, "url-for-test.txt")
bot.crawl(depth=1)

URL_1 = "https://marksachinperera.github.io/"
URL_2 = "https://marksachinperera.github.io/ContactMe.html"
URL_3 = "https://marksachinperera.github.io/AboutMe.html"

###****************####
# Placeholder document IDs for the three URLs above.
ID_1 = -1
ID_2 = -1
ID_3 = -1
###****************####

print "getting inverted index"
inverted_index = crawler.get_inverted_index()

print "getting resolved index"
resolved_index = crawler.get_resolved_inverted_index()

print "Setting up"

word_list_1 = {
    "jpg", "height", "done", "have", "home", "portfolio", "alt", "web", "le",
    "img", "personal", "mark", "width", "1500", "styles", "picture", "resume",
    "showing", "welcome", "hi", "img_2315", "perera", "projects", "me", "src",
    "about", "name", "1800", "this", "contact", "my", "page"
}

word_list_2 = {
    "feedback", "links", "ca", "coming", "soon", "jpg", "height", "home",