# NOTE(review): orphaned top-level duplicate of WebHunter.__init__ (the class
# follows in this file) — looks like a copy/paste artifact. `self`, IndexHunt,
# GraphHunt are not defined at module scope here, so calling this raises
# NameError. Confirm and remove.
def __init__(self):
    #db = Connection().web_hunter
    self.index, self.graph = IndexHunt(),GraphHunt()
    self.crawl_web('http://udacity.com/cs101x/urank/index.html')
    self.ranks = {}
    self.compute_ranks()
class WebHunter:
    """Crawl a seed site, build a keyword index and a link graph, and rank
    pages with a simplified PageRank so searches can be returned best-first.

    Collaborators (declared elsewhere in the project — assumed interfaces,
    TODO confirm):
      - IndexHunt: keyword store with find(keyword) -> [url] | falsy,
        update(keyword, links), insert(keyword, url)
      - GraphHunt: page store with insert(url, outlinks, title, content),
        find() -> iterable of {"url": ..., "links": [...]}, size() -> int
      - LinkFinder().start_parsing(html) -> (outlinks, text, title)
      - cache: module-level mapping url -> raw page content
    """

    def __init__(self):
        #db = Connection().web_hunter
        self.index, self.graph = IndexHunt(), GraphHunt()
        # Crawl first so the graph is populated before ranking.
        self.crawl_web('http://udacity.com/cs101x/urank/index.html')
        self.ranks = {}
        self.compute_ranks()
        #db.index.insert(self.index)

    def qsort(self, tosort, ranks):
        """Return *tosort* sorted highest-rank-first (recursive quicksort).

        Ties (rank == pivot's rank) land before the pivot, matching the
        original partitioning (`>=` goes to the greater half).
        """
        if not tosort:  # idiomatic truthiness test instead of `== []`
            return []
        pivot, rest = tosort[0], tosort[1:]
        lesser = self.qsort([x for x in rest if ranks[x] < ranks[pivot]], ranks)
        greater = self.qsort([x for x in rest if ranks[x] >= ranks[pivot]], ranks)
        # Descending order: greater half, then pivot, then lesser half.
        return greater + [pivot] + lesser

    def ordered_search(self, keyword):
        """Return urls matching *keyword* ordered best-rank-first, or None."""
        links = self.index.find(keyword.lower())
        if links:
            return self.qsort(links, self.ranks)
        return None

    def get_page(self, url):
        """Return cached content for *url*, or "" when it is not cached."""
        if url in cache:
            return cache[url]
        return ""

    def union(self, a, b):
        """Append to *a*, in place, each element of *b* not already in *a*.

        Order of *a* is preserved (a set would lose crawl ordering).
        """
        for e in b:
            if e not in a:
                a.append(e)

    def add_page_to_index(self, url, content):
        """Index every whitespace-separated word of *content* under *url*."""
        for word in content.split():
            self.add_to_index(word, url)

    def add_to_index(self, keyword, url):
        """Record that *keyword* occurs at *url* in the index store."""
        links = self.index.find(keyword)
        if links:
            if url not in links:
                links.append(url)
                self.index.update(keyword, links)
        else:
            self.index.insert(keyword, url)

    def lookup(self, keyword):
        """Return the list of urls indexed under *keyword*, or None."""
        # find() may return a falsy value for a miss; normalize that to None.
        return self.index.find(keyword) or None

    def crawl_web(self, seed):
        """Crawl outward from *seed*, filling the index and the link graph."""
        tocrawl = [seed]
        crawled = []
        while tocrawl:
            page = tocrawl.pop()
            if page in crawled:  # guard clause instead of nested if
                continue
            content = self.get_page(page)
            outlinks, content, title = LinkFinder().start_parsing(content)
            self.add_page_to_index(page, content)
            self.graph.insert(page, outlinks, title, content)
            self.union(tocrawl, outlinks)
            crawled.append(page)

    def compute_ranks(self):
        """Run 10 iterations of simplified PageRank over the crawled graph.

        rank(p) = (1-d)/N + d * sum(rank(q)/outdegree(q) for q linking to p)
        """
        d = 0.8  # damping factor
        numloops = 10
        npages = self.graph.size()
        if not npages:
            # Empty graph: nothing to rank. Guards the 1.0/npages division
            # below (previously a ZeroDivisionError on an empty crawl).
            return
        for page in self.graph.find():
            self.ranks[page["url"]] = 1.0 / npages
        for _ in range(numloops):
            newranks = {}
            # Materialize once: the nested loops below iterate it repeatedly.
            graph = list(self.graph.find())
            for page in graph:
                newrank = (1 - d) / npages
                for node in graph:
                    if page["url"] in node["links"]:
                        newrank += d * (self.ranks[node["url"]] / len(node["links"]))
                newranks[page["url"]] = newrank
            self.ranks = newranks