def test_invIndex_equals_resInvIndex(crawler):
    print "3) Testing to see if the inverted index correlates with the resolved inverted index..."
    inverted_index = crawler.get_inverted_index()
    res_inverted_index = crawler.get_resolved_inverted_index()
    word_wordID_check = True
    url_docID_check = True
    for word, urls in res_inverted_index.items():
        word_id = crawler._word_id_cache[word]
        # Check if each word in the resolved inverted index corresponds with the wordID in the inverted index
        if word_id not in inverted_index:
            print "Word: ", word, " is not linked"
            word_wordID_check = False
            continue  # no entry to compare against, so skip the URL check for this word
        inv_indx_docIDs = inverted_index[word_id]
        for url in urls:
            docID = crawler._doc_id_cache[url]
            # Check if each URL in the resolved inverted index corresponds with the correct documentID in the inverted index
            if docID not in inv_indx_docIDs:
                print "URL: ", url, " is not linked"
                url_docID_check = False
    if not word_wordID_check or not url_docID_check:
        return False
    return True
def test_inverted_index(crawler):
    print "1) Testing the inverted index..."
    inverted_index = crawler.get_inverted_index()

    # Checking if the inverted index contains all the wordIDs
    wordID_check = True
    for word in crawler._word_id_cache:
        word_id = crawler._word_id_cache[word]
        if word_id not in inverted_index:
            print "Word: ", word, " is not in the inverted_index"
            wordID_check = False
    if not wordID_check:
        return False

    # Checking if the inverted index contains valid documentIDs
    docID_check = True
    valid_docIDs = set(crawler._doc_id_cache.values())  # build once instead of re-scanning per docID
    for wordID, docIDs in inverted_index.items():
        for docID in docIDs:
            if docID not in valid_docIDs:
                print "DocID: ", docID, " is not in the _doc_id_cache"
                docID_check = False
    if not docID_check:
        return False
    return True
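# A minimal driver for the two tests above -- a sketch only, assuming the
# crawler(None, "urls.txt") construction and crawl(depth=1) call used by the
# smoke-test scripts below, and a urls.txt seed file in the working directory.
if __name__ == "__main__":
    from crawler import crawler

    test_crawler = crawler(None, "urls.txt")
    test_crawler.crawl(depth=1)

    print "test_inverted_index:", "PASS" if test_inverted_index(test_crawler) else "FAIL"
    print "test_invIndex_equals_resInvIndex:", "PASS" if test_invIndex_equals_resInvIndex(test_crawler) else "FAIL"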
def backEnd_run(dep):
    # Note: relies on a module-level crawler instance and page_rank being in scope.
    # Crawl through the URLs provided in urls.txt
    crawler.crawl(depth=int(dep))

    # Retrieve the data needed for populating the SQL tables
    doc_index = crawler.get_docs_cache()
    inverted_index = crawler.get_inverted_index()
    anchor_db = crawler.get_anchor_db()
    lexicon = crawler.get_lexicon()
    pg_rank = page_rank(crawler.get_links_queue())
    titles_list = crawler.get_title_cache()
    resolved_inverted_index = crawler.get_resolved_inverted_index()
    description = crawler.get_desc_cache()
    images = crawler.get_image_cache()

    return (doc_index, titles_list, lexicon, anchor_db, pg_rank,
            inverted_index, description, images, resolved_inverted_index)
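# A hedged usage sketch for backEnd_run above, not part of the original file.
# It assumes a module-level crawler instance (which the function reads as a
# global) and an importable page_rank function; in this project page_rank
# appears to live in a pagerank module, but that path is an assumption:
#
#   from crawler import crawler
#   from pagerank import page_rank
#   crawler = crawler(None, "urls.txt")
#
(doc_index, titles_list, lexicon, anchor_db, pg_rank, inverted_index,
 description, images, resolved_inverted_index) = backEnd_run(1)
print "crawled", len(doc_index), "documents;", len(lexicon), "lexicon entries"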
import os
import sys

from crawler import crawler

print "Now starting the test...."
print

crawler = crawler(None, "urls.txt")
crawler.crawl(depth=1)  # populate the indexes before reading them

inverted_index = crawler.get_inverted_index()
print
print "inverted_index is......"
print
print inverted_index
print
print

resolved_inverted_index = crawler.get_resolved_inverted_index()
print "resolved_inverted_index is......"
print
print resolved_inverted_index
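# For reference, a hand-built illustration of the two shapes printed above.
# The words, IDs, and URL here are invented for illustration only, and the
# sets mirror the membership tests in the checks above, though the crawler
# may return lists instead:
# inverted_index maps word IDs to the document IDs containing that word,
example_inverted_index = {1: set([1, 2]), 2: set([2])}
# while resolved_inverted_index maps the words themselves to URLs.
example_resolved_inverted_index = {"welcome": set(["https://example.com/"])}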
crawler = crawler(None, "url-for-test.txt")
crawler.crawl(depth=1)

URL_1 = "https://marksachinperera.github.io/"
URL_2 = "https://marksachinperera.github.io/ContactMe.html"
URL_3 = "https://marksachinperera.github.io/AboutMe.html"

###****************####
ID_1 = -1
ID_2 = -1
ID_3 = -1
###****************####

print "getting inverted index"
inverted_index = crawler.get_inverted_index()

print "getting resolved index"
resolved_index = crawler.get_resolved_inverted_index()

print "Setting up"
word_list_1 = {
    "jpg", "height", "done", "have", "home", "portfolio", "alt", "web",
    "le", "img", "personal", "mark", "width", "1500", "styles", "picture",
    "resume", "showing", "welcome", "hi", "img_2315", "perera", "projects",
    "me", "src", "about", "name", "1800", "this", "contact", "my", "page"
}
word_list_2 = {
    "feedback", "links", "ca", "coming", "soon", "jpg", "height", "home",