def calc_page_ranks(self, d=0.85, iterations=1000):
    """Compute a PageRank score for every page and store it on the page.

    The work happens in four stages:

    1. Build ``self.adj`` so that ``adj[i][j] == 1.0`` when the page with
       index ``i`` contains a link that resolves to the page with index
       ``j``.
    2. Divide every row of ``self.adj`` by its sum (the number of distinct
       pages it links to); rows without links stay all-zero.  The result
       is written to ``adj.txt``.
    3. Repeat ``ranks = d * adj.T . ranks + (1 - d)`` ``iterations``
       times, starting from a vector of ones.
    4. Copy ``self.ranks[page.index]`` onto each ``page.rank`` and write
       the rank vector to ``page_ranks.txt``.

    Args:
        d: Damping factor; weight given to rank flowing in over links.
            ``1 - d`` is added to every page on every iteration.
        iterations: Number of update steps to run (previously a
            hard-coded 1000).

    Side effects:
        Sets ``self.adj`` and ``self.ranks``, updates ``rank`` on every
        page in ``self.pages_with_ids`` and writes ``adj.txt`` and
        ``page_ranks.txt`` in the current working directory.
    """
    pages = self.pages_with_ids
    page_count = len(pages)
    self.adj = numpy.zeros((page_count, page_count))

    # --- 1. Adjacency matrix: row = linking page, column = linked page.
    pbar = ProgressBar(widgets=['Processing links: ', SimpleProgress()],
                       maxval=page_count).start()
    # Content hash per resolved URL, so a target that is linked many times
    # is fetched, parsed and hashed only once during this call.
    target_ids = {}
    for progress, page in enumerate(pages.values(), 1):
        pbar.update(progress)
        for a in page.a:
            url = page.normalize_url(a.get('href'))
            # NOTE(review): self.S is presumably the collection of known
            # URLs - confirm against the code that populates it.
            if url not in self.S:
                continue
            if url not in target_ids:
                html = helpers.get_html(self.urls_with_nums[url])
                soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
                target_ids[url] = helpers.page_hash(soup.prettify())
            target_id = target_ids[url]
            if target_id in pages:
                self.adj[page.index][pages[target_id].index] = 1.0
    pbar.finish()

    # --- 2. Normalise each row by the number of pages it links to.
    pbar = ProgressBar(widgets=['Normalizing adjacencies: ', SimpleProgress()],
                       maxval=page_count).start()
    row_sums = numpy.sum(self.adj, axis=1)
    for progress, page in enumerate(pages.values(), 1):
        pbar.update(progress)
        row_total = row_sums[page.index]
        if row_total != 0:
            self.adj[page.index] /= row_total
        else:
            self.adj[page.index] = 0.0
    pbar.finish()
    numpy.savetxt("adj.txt", self.adj)

    # --- 3. Iterate the rank update.
    # adj[i][j] is the share of page i's rank handed to page j, so the rank
    # arriving at page j is sum_i adj[i][j] * ranks[i], i.e. adj.T . ranks.
    # Multiplying by adj itself would instead score a page by the pages it
    # links to rather than by the pages linking to it.
    inbound = self.adj.T
    self.ranks = numpy.ones(page_count)
    base = (1.0 - d) * numpy.ones(page_count)  # loop-invariant term
    pbar = ProgressBar(widgets=['Running PageRank: ', SimpleProgress()],
                       maxval=iterations).start()
    for step in range(iterations):
        pbar.update(step)
        self.ranks = d * numpy.dot(inbound, self.ranks) + base
    pbar.finish()

    # --- 4. Publish the ranks on the page objects.
    pbar = ProgressBar(widgets=['Updating pages with new ranks: ', SimpleProgress()],
                       maxval=page_count).start()
    for progress, page in enumerate(pages.values(), 1):
        pbar.update(progress)
        page.rank = self.ranks[page.index]
    pbar.finish()
    numpy.savetxt("page_ranks.txt", self.ranks)
def __init__(self, title, num, html, url, text):
    """Create a page record from crawled data.

    Args:
        title: Stored unchanged as ``self.title``.
        num: Stored unchanged as ``self.num``.
        html: Passed to ``helpers.page_hash`` to derive ``self.ID``.
        url: Becomes the first (and only) entry of ``self.urls``.
        text: Split on single spaces; the pieces at positions 100-109
            are joined with spaces to form ``self.snippet``.  The snippet
            is empty when there are 100 pieces or fewer.
    """
    self.ID = helpers.page_hash(html)

    # Values taken straight from the arguments.
    self.title = title
    self.num = num
    self.urls = [url]

    # Containers and counters that start out empty / zeroed here.
    self.a = []
    self.anchor_texts = []  # also contains alt text of <img>'s within <a></a>
    self.inlinks = 0.0
    self.rank = 0.0
    self.index = 0

    # Short preview taken from a fixed window of the space-separated text.
    words = text.split(' ')
    self.snippet = ' '.join(words[100:110])