def test_triche_pagerank(n=3000):
    """Compare PageRank vectors before and after adding cheater nodes.

    Builds a random graph of size n, then appends two "cheater" nodes that
    each link to node n-1, and returns both rank vectors so the effect of
    the cheat can be inspected.

    Returns:
        (baseline, rigged): the real parts of the PageRank vectors for the
        original graph and for the graph with the cheaters added.
    """
    cheaters = 2  # number of cheating nodes appended to the graph
    G = core.construire_G(n)
    baseline = np.real(pr.page_rank(G))
    # Embed G in a larger adjacency matrix and point every cheater at node n-1.
    G_cheat = np.zeros((n + cheaters, n + cheaters))
    G_cheat[:n, :n] = G
    for j in range(cheaters):
        G_cheat[n + j, n - 1] = 1
    rigged = np.real(pr.page_rank(G_cheat))
    return baseline, rigged
def get_docid_score(self):
    """Score every document with PageRank and cache the results.

    Runs PageRank over self._links_set, merges the scores into
    self._docid_score, and returns that cache.
    """
    scores = pagerank.page_rank(self._links_set)
    self._docid_score.update(scores)
    return self._docid_score
def generate_page_ranks(self, links):
    """Generate page ranks of links and store in database. Return pageranks dictionary"""
    page_ranks = pr.page_rank(links.keys())
    # (Sai) Insert into page rank table in db.
    # NOTE(review): the SQL is built with %-formatting; safe here only
    # because both values are numeric, but parameterized queries would be
    # preferable. Committing inside the loop is one transaction per row.
    for doc_id, pagerank in page_ranks.items():
        query_str = "INSERT INTO pageRanks VALUES(%d, %f);" % (doc_id, pagerank)
        self._cur.execute(query_str)
        self._db_conn.commit()
    # Pages in page ranks dictionary:
    pages_in_pr = set(page_ranks.keys())
    # Pages in document index (first element of each row is the doc id):
    doc_index_pages = set()
    for elem in self._document_index:
        doc_index_pages.add(elem[0])
    # Pages in document index but not page ranks
    missing_pages = doc_index_pages - pages_in_pr
    # Add missing pages to page rank table with rank of 0:
    for page in missing_pages:
        query_str = "INSERT INTO pageRanks VALUES(%d, 0);" % page
        self._cur.execute(query_str)
        self._db_conn.commit()
    return page_ranks
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Fans the crawl out over MAX_THREADS worker threads, waits for them,
    then inverts the index, runs PageRank over the collected links, and
    persists everything. Prints per-phase timings when done.
    (Python 2 code: print statements.)
    """
    threadqueue = []
    start = time.time()
    self._max_depth = depth
    # Spawn one crawler thread per slot; each works the shared url queue.
    for i in range(self.MAX_THREADS):
        thread = threading.Thread(target=self.crawler_thread,
                                  args=(i, depth, timeout))
        thread.start()
        threadqueue.append(thread)
    # Wait for every worker to drain the queue.
    for i in range(self.MAX_THREADS):
        threadqueue[i].join()
    mid = time.time()
    self._invert_index()
    invert = time.time()
    self._scores = page_rank(self._links, depth)
    scoring = time.time()
    self._store_data()
    end = time.time()
    print "time spent crawling: %d" % (mid - start)
    print "time spent inverting: %d" % (invert - mid)
    print "time spent scoring: %d" % (scoring - invert)
    print "time spent storing: %d" % (end - scoring)
    print "num errors %d" % self.errors
    # Debug hooks left disabled: iterate results without printing.
    for word in sorted(self._doc_id_cache):
        #print word
        pass
    links = [(self._doc_cache[x], self._doc_cache[y], d)
             for x, y, d in self._links]
    for word in sorted(links):
        if word[2] <= depth:
            #print word
            pass
def page_rank_to_DB(self):
    """Score every cached link with PageRank and persist one document per
    doc id into the pageRankDB Mongo collection."""
    scores = page_rank(self._links_cache)
    for doc_id, score in scores.items():
        pageRankDB.insert_one({'doc_id': doc_id, 'url_ranks': score})
def page_rank_calculation(self, iterations=20, initial_pr=0.85 ):
    """Run PageRank over all rows of the Links table and persist the
    resulting doc_id -> rank mapping into the PageRank table.

    iterations -- number of PageRank iterations to run
    initial_pr -- initial rank / damping value forwarded to page_rank()
    """
    if self.db_conn.cursor():
        self.cur.execute('SELECT * FROM Links;')
        data = self.cur.fetchall()
        ranked_list = pagerank.page_rank(data, iterations, initial_pr)
        # FIX: use parameterized "?" placeholders. The old code
        # %-interpolated the values into quoted '%s' slots, which stored
        # the numbers as strings and was fragile against malformed input.
        # (INSERT OR REPLACE implies SQLite, whose paramstyle is "?".)
        for entry in ranked_list:
            self.cur.execute(
                "INSERT OR REPLACE INTO PageRank (doc_id, rank) VALUES (?, ?);",
                (entry, ranked_list[entry]))
        # Single commit: one transaction for the whole batch.
        self.db_conn.commit()
def calculate_pagerank(self):
    """Rank every known link with PageRank and persist each
    (doc_id, rank) pair, committing after every insert."""
    for doc_id, rank in page_rank(self._links).iteritems():
        self._cursor.execute(
            "INSERT OR IGNORE INTO pageRank VALUES (?, ?)", (doc_id, rank))
        self._conn.commit()
def insert_pagerank(self):
    """Insert the page ranking of the specific page accessed.

    Skips the whole step when no links were cached; otherwise computes
    PageRank over the cache and inserts one (doc_id, doc_rank) row per
    document, ignoring duplicates.
    """
    if len(self._links_cache) > 0:
        link_rankings = page_rank(self._links_cache)
        # FIX: bind values with "?" placeholders instead of %-formatting
        # the SQL string — safer and keeps native numeric types.
        # (INSERT OR IGNORE implies SQLite, whose paramstyle is "?".)
        for doc_id, doc_rank in link_rankings.iteritems():
            self._db_cursor.execute(
                "INSERT OR IGNORE INTO page_rank(doc_id, doc_rank) VALUES (?, ?);",
                (doc_id, doc_rank))
def generate_page_ranks(self, links):
    """Generate page ranks of links and store in database. Return pageranks dictionary"""
    page_ranks = pr.page_rank(links.keys())
    # (Sai) Insert into redis: one sorted-set member per doc id, scored by
    # its page rank.
    for doc_id, pagerank in page_ranks.items():
        # NOTE(review): this positional zadd call matches the pre-3.0
        # redis-py signature; redis-py >= 3.0 requires
        # zadd(name, {member: score}) — confirm the client version.
        redis_ret_val = self._r_conn.zadd("pageranks", str(doc_id), str(pagerank))
    return page_ranks
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Pops urls off self._url_queue up to `depth` levels deep and indexes
    each unseen document, then inverts the index, runs PageRank, and
    persists everything. Prints crawl/store timings.
    (Python 2 code: print statements.)
    """
    seen = set()
    self._max_depth = depth
    start = time.time()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = []
            self._index_document(soup)
            self._add_words_to_document()
            print " url=" + str(self._curr_url)
        except Exception as e:
            # NOTE(review): broad catch keeps the crawl alive on any
            # per-url failure, but can also hide real bugs.
            print e
            pass
        finally:
            if socket:
                socket.close()
    mid = time.time()
    self._invert_index()
    self._scores = page_rank(self._links, max_depth=self._max_depth,
                             num_iterations=20)
    self._store_data()
    end = time.time()
    print "time spent crawling: %d" % (mid - start)
    print "time spent storing: %d" % (end - mid)
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth` levels deep, then stores
    the inverted index and the PageRank scores through store().
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = []
            self._index_document(soup)
            self._add_words_to_document()
            #print (" url=" + repr(self._curr_url))
        except Exception as e:
            print(e)
            pass
        finally:
            if socket:
                socket.close()
    # NEW in lab3: add inverted_index in the db, one record per word id
    # with the list of doc ids it appears in.
    for i in self._inverted_index:
        l = []
        for j in self._inverted_index[i]:
            l.append(j)
        dic = {'words_id': i, 'doc_id': l}
        store('inverted_index', dic)
    # NEW FOR LAB3: add the PageRank scores, one record per doc id.
    self._rank_page = page_rank(
        list(zip(self._from_doc_list, self._to_doc_list)))
    for i in self._rank_page:
        dic = {'doc_id': i, 'score': self._rank_page[i]}
        store('page_rank', dic)
def doc_id_index(links, inverted_doc_id, desc):
    """ Build doc_id_index """
    # One mongo record per ranked document: the PageRank score plus the
    # url/title/description resolved from the lookup dicts.
    records = []
    for doc_id, rank in page_rank(links).items():
        records.append({
            "_id": doc_id,
            "pageRank": rank,
            "url": inverted_doc_id[doc_id],
            "title": desc[doc_id]['title'],
            "description": desc[doc_id]['description'],
        })
    write_records(records, "csc326", "doc_id_index")
def create_rows(dependencygroup, leadingpath):
    """Returns a list of rows for a dependencygroup.

    Each row is (prettified filename, pagerank score, node id, matrix row),
    ordered from highest-ranked to lowest.
    """
    converter = pagerank.DependenciesToLinkMatrix(dependencygroup.dependencies)
    matrix = converter.create_matrix()
    ranking = pagerank.page_rank(matrix)
    ids = [idx for idx in range(len(matrix))]
    filenames = [utils.prettify_path(converter.id_to_node_map[nid], leadingpath)
                 for nid in ids]
    # FIX: sorted(..., reverse=True) replaces zip(...).sort()/.reverse().
    # On Python 3 zip() returns an iterator with no .sort(), so the old
    # code crashed here; sorted() also handles Python 2's list.
    rowinfos = sorted(zip(filenames, ranking, ids, matrix),
                      key=lambda item: item[1], reverse=True)  # by ranking, desc
    return rowinfos
def create_rows(dependencygroup, leadingpath):
    """Returns a list of rows for a dependencygroup.

    Each row is (prettified filename, pagerank score, node id, matrix row),
    ordered from highest-ranked to lowest.
    """
    converter = pagerank.DependenciesToLinkMatrix(dependencygroup.dependencies)
    matrix = converter.create_matrix()
    ranking = pagerank.page_rank(matrix)
    ids = [idx for idx in range(len(matrix))]
    filenames = [utils.prettify_path(converter.id_to_node_map[nid], leadingpath)
                 for nid in ids]
    # FIX: sorted(..., reverse=True) replaces zip(...).sort()/.reverse().
    # On Python 3 zip() returns an iterator with no .sort(), so the old
    # code crashed here; sorted() also handles Python 2's list.
    rowinfos = sorted(zip(filenames, ranking, ids, matrix),
                      key=lambda item: item[1], reverse=True)  # by ranking, desc
    return rowinfos
def crawler_page_ranks(self):
    """Compute PageRank for all crawled links, resolve doc ids to urls,
    store the list (sorted by descending score) in the db, and return it."""
    calculatedRanks = page_rank(self._links)
    # order by greatest pg to least
    # create a list of (url, score) tuples
    pageRanks = []
    for page in calculatedRanks:
        for doc_id, url in self.doc_ids:
            # FIX: compare ids with "==". The old code used "is", which
            # tests object identity and only happens to work for small
            # ints (CPython interns -5..256), silently dropping matches
            # for larger doc ids.
            if doc_id == page:
                pageRanks.append((url, calculatedRanks[page]))
    # sort the list by descending page ranks
    pageRanks.sort(key=lambda tup: tup[1], reverse=True)
    # store to database
    self.dbconnection.set('pageranks', pageRanks)
    return pageRanks
def update_database(self):
    """Rebuild the PageRank_Doc and Words tables in dbFile.db from the
    in-memory caches.

    PageRank_Doc rows are (doc_id, url, rank, title); crawled docs that
    received no PageRank score get rank 0, and docs without a cached
    title get "Unknown Title". Words rows are (word_id, word, doc_id)
    for every doc a word appears in.
    (Python 2 code: iteritems / print statement.)
    """
    con = sqlite3.connect('dbFile.db')
    cur = con.cursor()
    # Start from a clean slate on every run.
    cur.execute("DELETE FROM PageRank_Doc")
    cur.execute("DELETE FROM Words")
    # returns dict of docid -> pagerank number
    page_rank_temp = page_rank(self.get_links())
    for key, value in page_rank_temp.iteritems():
        if key in self._doc_id_to_doc_title_cache:
            cur.execute(
                "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                (key, self._id_doc_cache[key], value,
                 self._doc_id_to_doc_title_cache[key]))
        else:
            cur.execute(
                "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                (key, self._id_doc_cache[key], value, "Unknown Title"))
    for key, value in self._doc_id_cache.items():
        # if doc_id does not exist inside database, because it does not
        # have a page rank score, save its page rank score as 0.
        if value not in page_rank_temp:
            if value in self._doc_id_to_doc_title_cache:
                cur.execute(
                    "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                    (value, key, 0, self._doc_id_to_doc_title_cache[value]))
            else:
                cur.execute(
                    "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                    (value, key, 0, "Unknown Title"))
    # key = word_id, value = word (string)
    for key, value in self._id_word_cache.items():
        for id in set(self._word_id_to_doc_id_cache[key]):
            print "INSERT OR IGNORE INTO PageRank_Doc VALUES(" + str(
                key) + ", " + str(value) + ", " + str(id) + ")"
            cur.execute("INSERT OR IGNORE INTO Words VALUES (?, ?, ?)",
                        (key, value, id))
    con.commit()
    con.close()
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then runs PageRank over
    self.links and pushes ranks plus index to the database — both via a
    background thread and synchronously.
    (Python 2 code: print statements / thread module.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = [ ]
            self._index_document(soup)
            self._add_words_to_document()
            print " url="+repr(self._curr_url)
        except Exception as e:
            print e
            pass
        finally:
            if socket:
                socket.close()
    print(self.links)
    self.pagerank = pagerank.page_rank(self.links, num_iterations=20,
                                       initial_pr=1.0)
    # thread.start_new_thread(self.pagerank_db_update,("1",self))
    # thread.start_new_thread(self.index_db_update,("1",self))
    # NOTE(review): update_everything runs on a background thread while
    # the two db updates below also run synchronously — confirm the
    # double execution is intended.
    thread.start_new_thread(update_everything,("1",self))
    self.pagerank_db_update()
    self.index_db_update()
def backEnd_run(dep):
    """Crawl to depth `dep`, then collect and return every data structure
    the front end needs for populating the SQL tables.

    Returns (doc_index, titles_list, lexicon, anchor_db, pg_rank,
    inverted_index, description, images, resolved_inverted_index).
    """
    # Crawl through the URLs provided in urls.txt
    crawler.crawl(depth=int(dep))
    # Retrieve Data needed for populating the SQL Tables
    doc_index = crawler.get_docs_cache()
    inverted_index = crawler.get_inverted_index()
    anchor_db = crawler.get_anchor_db()
    lexicon = crawler.get_lexicon()
    # PageRank runs over the link queue recorded during the crawl.
    pg_rank = page_rank(crawler.get_links_queue())
    titles_list = crawler.get_title_cache()
    resolved_inverted_index = crawler.get_resovled_inverted_index()
    description = crawler.get_desc_cache()
    images = crawler.get_image_cache()
    return doc_index, titles_list, lexicon, anchor_db, pg_rank, inverted_index, description, images, resolved_inverted_index
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then attaches the
    PageRank score to each in-memory document object.
    (Python 2 code: print statements.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = []
            self._index_document(soup)
            self._add_words_to_document()
            print " url=" + repr(self._curr_url)
        except Exception as e:
            print e
            pass
        finally:
            if socket:
                socket.close()
    pageranks = page_rank(self.links)
    print "pagerank length: ", len(pageranks)
    print
    # Attach each score to its document object (ranked pages that were
    # never fetched are skipped).
    for page in pageranks.keys():
        if page in self._documents.keys():
            print self._documents[page].title, page, pageranks[page]
            self._documents[page].pagerank = pageranks[page]
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then runs PageRank over
    the Links table and writes each score back to its Documents row
    (peewee ORM). (Python 2 code: print statements / iteritems.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = []
            self._index_document(soup)
            self._add_words_to_document()
            self._update_inverted_index()
            print " url=" + repr(self._curr_url)
        except urllib2.URLError as e:
            # Only network-level failures are tolerated here.
            print e
            pass
        finally:
            if socket:
                socket.close()
    # Rank the (from, to) doc-id pairs persisted in the Links table.
    pr = pagerank.page_rank([(elem.from_doc.id, elem.to_doc.id)
                             for elem in Links.select()])
    for doc_id, rank_score in pr.iteritems():
        print doc_id, rank_score
        Documents.update(page_rank=rank_score).where(
            Documents.id == doc_id).execute()
def pagerank_score_degre_entrant(n=20):
    """Plot, for each degree found in a random graph of size n, the
    average (truncated) PageRank score of the nodes with that degree."""
    G = core.construire_G(n)
    # Truncate every score to 4 decimal places.
    scores = [int(10000 * np.real(s)) / 10000 for s in pr.page_rank(G)]
    degrees = np.sum(G, axis=1).astype(int)
    # Group the scores by node degree.
    by_degree = {}
    for idx in range(len(degrees)):
        by_degree.setdefault(degrees[idx], []).append(scores[idx])
    averages = {deg: np.average(vals) for deg, vals in by_degree.items()}
    # Plot average score against degree, sorted by degree.
    X = np.array(list(averages))
    Y = np.array([averages[deg] for deg in X])
    order = np.argsort(X)
    plt.plot(X[order], Y[order], marker='o')
    plt.show()
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then computes PageRank
    over the link db (when non-empty) and hands the result to update_db.
    (Python 2 code: print statement.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = [ ]
            self._index_document(soup)
            self._add_words_to_document()
        except Exception as e:
            print e
            pass
        finally:
            if socket:
                socket.close()
    # NOTE(review): rank stays None when no links were collected —
    # update_db is expected to handle that case; confirm.
    rank = None
    if (self._link_db) :
        rank = pr.page_rank(self._link_db)
    self.update_db(rank)
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then computes PageRank
    over the link db (when non-empty) and hands the result to update_db.
    (Python 2 code: print statement.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = []
            self._index_document(soup)
            self._add_words_to_document()
        except Exception as e:
            print e
            pass
        finally:
            if socket:
                socket.close()
    # NOTE(review): rank stays None when no links were collected —
    # update_db is expected to handle that case; confirm.
    rank = None
    if (self._link_db):
        rank = pr.page_rank(self._link_db)
    self.update_db(rank)
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then runs PageRank over
    the Links table and writes each score back to its Documents row
    (peewee ORM). (Python 2 code: print statements / iteritems.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = [ ]
            self._index_document(soup)
            self._add_words_to_document()
            self._update_inverted_index()
            print " url="+repr(self._curr_url)
        except urllib2.URLError as e:
            # Only network-level failures are tolerated here.
            print e
            pass
        finally:
            if socket:
                socket.close()
    # Rank the (from, to) doc-id pairs persisted in the Links table.
    pr = pagerank.page_rank([(elem.from_doc.id, elem.to_doc.id)
                             for elem in Links.select()])
    for doc_id, rank_score in pr.iteritems():
        print doc_id, rank_score
        Documents.update(page_rank=rank_score).where(Documents.id==doc_id).execute()
def get_page_rank(self):
    """Populate self._page_rank with a PageRank score for every linked
    doc id, plus an explicit 0 for crawled docs that received no score."""
    for doc_id, rank in page_rank(self._links).iteritems():
        self._page_rank[doc_id] = rank
    # Documents nothing links to fall back to a score of zero.
    for doc_id in self._doc_id_cache.itervalues():
        if doc_id not in self._page_rank:
            self._page_rank[doc_id] = 0
def test_page_rank():
    """Check that crawler.compute_page_rank() matches a direct call to
    page_rank() on the same link list.

    Returns True when both rank dictionaries are equal, else False.
    """
    test_crawler = crawler(None, "")
    # test values
    DOC_ID_A = 1
    DOC_ID_B = 2
    DOC_ID_C = 3
    DOC_ID_D = 4
    # Build the chain A -> B -> D -> C inside the crawler.
    test_crawler.add_link(DOC_ID_A, DOC_ID_B)
    test_crawler.add_link(DOC_ID_B, DOC_ID_D)
    test_crawler.add_link(DOC_ID_D, DOC_ID_C)
    test_crawler.compute_page_rank()
    # Expected and actual result comparison
    expected_result = page_rank([(DOC_ID_A, DOC_ID_B),
                                 (DOC_ID_B, DOC_ID_D),
                                 (DOC_ID_D, DOC_ID_C)])
    actual_result = test_crawler._page_rank
    # FIX: cmp() was removed in Python 3; direct dict equality is
    # equivalent and already yields the boolean we want.
    return expected_result == actual_result
def update_page_rank(self): """ The function will insert all the links relations from the url_list file into DB :return: """ rank_dict = page_rank(self._in_out_links) print self._in_out_links with self._db_conn: c = self._db_conn.cursor() # for every document that we have crawled for doc_id in self._doc_id_cache.values(): # if there is a rank for this document if doc_id in rank_dict: _rank = rank_dict[doc_id] # if there is no rank for this document, meaning nothing links to the page else: _rank = 0 # if there was no such entry for DocId then this will create the rank entry # but there was an entry for DocId then nothing will be updated c.execute("INSERT OR IGNORE INTO PageRank (DocId, rank) VALUES (?,?)", (doc_id, _rank)) # Here we make sure ranks will be update-to-date even if they existed before c.execute("UPDATE PageRank SET Rank=? WHERE DocId=?", (_rank, doc_id))
# Testing of the inverted and resolved indexes # just print out their values #document_index = bot.get_document_index() document_index = bot.get_document_index_dict() #print "\nDocument Index\n~~~~~~~~~~~~~~\n", document_index lexicon = bot.get_lexicon() #print "\nLexicon\n~~~~~~~\n", lexicon inverted_index = bot.get_inverted_index() #print "\nInverted Index\n~~~~~~~~~~~~~~\n", inverted_index resolved_inverted_index = bot.get_resolved_inverted_index() #print "\nResolved Index\n~~~~~~~~~~~~~~\n", resolved_inverted_index # LAB 3 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # compute page ranks of crawled pages pagerank_dict = page_rank(bot.get_list_of_links(), num_iterations=20) # make an upgraded resolved_inverted_index # maps word strings to ordered list of tuples # tuples = (url string, page title, page rank score) word_to_sorted_list_of_urls = {} for word, url_set in resolved_inverted_index.items(): # combine each url with its page rank into a tuple # sort that list of tuples by page rank # store new sorted list of url tuples into new data structure newlist = [] for a_url in url_set: a_url_id = bot.document_id(a_url) a_url_rank = pagerank_dict[a_url_id] a_url_title = document_index[a_url_id][0] newlist.append((a_url, a_url_title, a_url_rank))
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then dumps titles, the
    word index, doc ids, and PageRank results to flat text files
    (title.txt, dbwords.txt, dbdocs.txt, prresults.txt).
    (Python 2 code: print statements / iteritems.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = [ ]
            self._index_document(soup)
            self._add_words_to_document()
            print " url="+repr(self._curr_url)
        except Exception as e:
            print e
            pass
        finally:
            if socket:
                socket.close()
    #print self._from_to_id
    #print self._index_cache
    # NOTE(review): the files below are opened 'r+', which fails unless
    # they already exist and leaves stale bytes past the new content.
    # Dump doc titles as "id , title" lines.
    z=open('title.txt','r+')
    for x,y in self._title_id_cache.iteritems():
        x=x.replace("u'",'')
        a = str(y) + ' , ' + x + '\n'
        z.write(a)
    z.close()
    # Dump the word index as "word,id" lines.
    g=open('dbwords.txt','r+')
    for x,y in self._index_cache.iteritems():
        b = str(x) + ',' + str(y) + '\n'
        g.write(b)
    g.close()
    # Dump doc ids as "id , url" lines, stripping the unicode repr
    # prefix (u'...') and surrounding quotes from the url.
    h=open('dbdocs.txt','r+')
    s1=""
    for x,y in self._doc_id_cache.iteritems():
        if "u'" in repr(x):
            s1= repr(x).replace("u'"," ",1)
        if 'u"' in repr(x):
            s1= repr(x).replace('u"'," ",1)
        s2=s1.replace("'","")
        s3=s2.replace('"',"")
        c = str(y)+ ' , '+ s3 + '\n'
        h.write(c)
    h.close()
    # Rank the collected (from, to) id pairs and dump "id,score" lines.
    pagerank_results=pagerank.page_rank(self._from_to_id)
    #print pagerank_results
    f=open('prresults.txt','r+')
    for x,y in pagerank_results.iteritems():
        d = str(x) + ',' + str(y) + '\n'
        f.write(d)
    f.close()
from crawler import crawler
from pagerank import page_rank
import pprint

if __name__ == "__main__":
    # Crawl the seed urls one level deep, then rank the discovered links.
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=1)
    # calculates the page rank score and stores the data on disk
    pageranks = page_rank(bot.links)
    pprint.pprint(pageranks)
def _calculate_page_rank(self):
    """Return the PageRank scores for the doc-id link graph."""
    link_graph = self.links_by_doc_id
    return page_rank(link_graph)
def get_page_ranks(self):
    """Run PageRank over the collected url pairs and return the scores
    as a plain dict."""
    ranks = pagerank.page_rank(self._url_pairs)
    return dict(ranks)
def main():
    ''' Interactive entry point for the PageRank lab.

    Usage: python lab3.py [-w]

    Prompts for the input format (1 = csv, 2 = snap) and a file name,
    parses the file into (nodes, out_degrees, in_degrees), runs PageRank,
    and prints parse/rank timings plus the iteration count.

    The optional '-w' flag selects the weighted csv parser (used for the
    Football csv). Note: -w doesn't quite work at the moment.
    (Python 2 code: raw_input.)
    '''
    is_weighted = False # Used for '-w' flag
    # Setting variable if '-w' is used
    if len(sys.argv) > 1:
        if sys.argv[1] == '-w':
            is_weighted = True
    # Menu
    print('CSC 466: Lab 3 - PageRank & Link Analysis')
    parse_menu = raw_input('Parse:\n' + '1) csv\n' + '2) snap\n' )
    file_name = raw_input('File name: ')
    # PARSING - CSV Files
    # Note: The algorithm is the same, just parsing is different.
    if parse_menu == '1':
        print('Parsing/Creating Graph...')
        start = time.time() # Tracking time
        # Parses a csv file and returns a tuple (list, dictionary, dictionary)
        if is_weighted == False:
            (nodes, out_degrees, in_degrees) = parser.parse_csv(file_name)
        else:
            (nodes, out_degrees, in_degrees) = parser.parse_weighted_csv(file_name)
        end = time.time()
        print('Parse/Graph Set-up Time: ' + str(end - start) + ' seconds')
        # Sets up page rank structures
        pagerank.set_up(nodes, out_degrees, in_degrees)
        # PAGE RANKING
        print('Page Ranking...')
        start = time.time()
        num_iters = pagerank.page_rank(0) # Stores # of page rank iterations
        end = time.time()
        # Statistics
        print('Page Rank Time: ' + str(end-start) + ' seconds')
        print('Page Rank Iterations: ' + str(num_iters))
    # PARSING - SNAP Files
    elif parse_menu == '2':
        print('Parsing/Creating Graph...')
        start = time.time() # Tracking time
        # Parses a SNAP file and returns a tuple (list, dictionary, dictionary)
        (nodes, out_degrees, in_degrees) = parser.parse_snap(file_name)
        end = time.time()
        print('Parse/Graph Set-up Time: ' + str(end-start) + 'seconds')
        # Sets up page rank structures
        pagerank.set_up(nodes, out_degrees, in_degrees)
        # PAGE RANKING
        print('Page Ranking...')
        start = time.time()
        num_iters = pagerank.page_rank(0) # Stores # of page rank iterations
        end = time.time()
        # Statistics
        print('Page Rank Time: ' + str(end-start) + ' seconds')
        print('Page Rank Iterations: ' + str(num_iters))
    # Wrong input
    else:
        print('Invalid input - exiting')
from crawler import crawler
from pagerank import page_rank

# Get crawler object and crawl on urls found in urls.txt
# NOTE(review): these assignments shadow the imported names "crawler"
# and "page_rank", so neither can be called again below this point.
crawler = crawler(None, 'urls.txt')
crawler.crawl()
document_index = crawler.get_document_index()
# Run pagerank on the links generated by the crawler
pagerank = page_rank(crawler._links)
# Print every document from highest rank to lowest.
# (Python 2 only: print statements, iteritems, and the tuple-unpacking
# lambda signature were all removed in Python 3.)
for doc_id, rank in sorted(pagerank.iteritems(), key=lambda (k,v): (v,k), reverse=True):
    document = crawler._document_index[doc_id]
    print str(rank) + " : " + str(document[0]) + "\n"
def get_raw_page_rank(self):
    """Recompute self._page_rank from the cached url pairs (only when any
    exist) and return the cached value."""
    if self._url_pairs:
        self._page_rank = pagerank.page_rank(self._url_pairs)
    return self._page_rank
# call crawler with depth 1 # this will populate our database with all relavent information bot = crawler(con, "urls.txt") bot.crawl(depth=1) # extract all links from database with con: cur = con.cursor() cur.execute("SELECT * FROM link") con.commit() links = cur.fetchall() # calculate the page rank links page_rank = page_rank(links) # convert page_rank dict into a list of tuples page_rank_tuple=page_rank.items() # update the document table in the database with page ranks with con: cur = con.cursor() for x,y in page_rank_tuple: cur.execute("UPDATE document SET page_rank = ? WHERE id= ?", (float(y),int(x))) cur.execute("UPDATE url_list SET page_rank = ? WHERE doc_id= ?", (1000*float(y),int(x))) con.commit() # find max links """
def compute_page_rank(self):
    """Run PageRank on the accumulated link list and cache the scores."""
    link_pairs = self._page_rank_list
    self._page_rank = page_rank(link_pairs)
def _insert_pagerank(self):
    """Insert generated score for each page or link to database PageRank"""
    # Nothing to persist when no links were collected.
    if self._link_list:
        self._db.put_pageranks(pagerank.page_rank(self._link_list))
def insert_pagerank_to_db(self):
    """ Insert rankings of pages/documents to database"""
    # Guard clause: nothing to rank when the link cache is empty.
    if not self._links_cache:
        return
    for doc_id, doc_rank in page_rank(self._links_cache).iteritems():
        self._db_cursor.execute(
            'INSERT INTO PageRank(doc_id, doc_rank) VALUES (%d, %f);'
            % (doc_id, doc_rank))
import crawler
import pagerank
import redis
import json
import pprint
import os

# Crawl one level deep from the seed urls, then compute PageRank over the
# discovered link graph and collect the index structures.
bot = crawler.crawler(None, "urls.txt")
bot.crawl(depth=1)
page_rank = pagerank.page_rank(bot._links)
inverted_index = bot.get_inverted_index()
resolved_inverted_index = bot.get_resolved_inverted_index()
lexicon = bot._doc_id_cache
url_lexicon = bot._url_lexicon
# convert dictionary to json string data and convert all sets in values
# to list form (sets are not JSON-serializable)
json_page_rank = json.dumps(dict(page_rank))
json_inverted_index = {k: list(v) for k, v in inverted_index.items()}
json_inverted_index = json.dumps(json_inverted_index)
json_resolved_inverted_index = {
    k: list(v) for k, v in resolved_inverted_index.items()
}
json_resolved_inverted_index = json.dumps(json_resolved_inverted_index)
json_lexicon = json.dumps(dict(lexicon))
# store all data to redis database (currently disabled)
# redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)
# redis_db.set('page_rank',json_page_rank)
# redis_db.set('inverted_index',json_inverted_index)
# redis_db.set('resolved_inverted_index',json_resolved_inverted_index)
def crawl(self, depth=2, timeout=3):
    """Crawl the web!

    Indexes every unseen document up to `depth`, then persists every
    in-memory cache to sqlite: lexicon, documentId, invertedId,
    pageRankScores, docTitle, docWordHits, docAnchorHits, docSnippet.
    (Python 2 code: print statements.)
    """
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()
        # skip this url; it's too deep
        if depth_ > depth:
            continue
        doc_id = self.document_id(url)
        # we've already seen this document
        if doc_id in seen:
            continue
        seen.add(doc_id) # mark this document as visited
        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            # Per-document parsing state consumed by _index_document.
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._curr_doc_id = doc_id
            self._font_size = 0
            self._curr_words = []
            # keep track of word location for current docId
            self._curr_wordIndex = 0
            self._index_document(soup, doc_id)
            self._add_words_to_document()
            self._add_first_p_to_document(soup)
            print " url=" + repr(self._curr_url)
        except Exception as e:
            print e
            pass
        finally:
            if socket:
                socket.close()
    # After crawling, save all data to database.
    # save lexicon data to persistent storage
    # lexicon: wordId (INTEGER PRIMARY KEY), word (TEXT)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS lexicon (wordId INTEGER PRIMARY KEY, word TEXT); ''')
    lexiconData = [(int(wordId), word)
                   for word, wordId in self._word_id_cache.items()]
    self.cur.executemany(''' INSERT INTO lexicon VALUES (?,?) ''', lexiconData)
    self.db_conn.commit()
    # save documentId data to persistent storage
    # documentId: docId (INTEGER PRIMARY KEY), url (TEXT)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS documentId (docId INTEGER PRIMARY KEY, url TEXT); ''')
    documentIdData = [(int(docId), str(url))
                      for url, docId in self._doc_id_cache.items()]
    self.cur.executemany(''' INSERT INTO documentId VALUES (?,?) ''', documentIdData)
    self.db_conn.commit()
    # save invertedId data to persistent storage
    # invertedId: wordId (INTEGER), docId (INTEGER)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS invertedId (wordId INTEGER, docId INTEGER); ''')
    invertedIdData = []
    for wordId in self._inverted_index.keys():
        for docId in self._inverted_index[wordId]:
            invertedIdData.append((int(wordId), int(docId)))
    self.cur.executemany(''' INSERT INTO invertedId VALUES (?,?) ''', invertedIdData)
    self.db_conn.commit()
    # save pageRankScores data to persistent storage
    # pageRankScores: docId (INTEGER), score (REAL)
    pageRankScores = pagerank.page_rank(self._from_to_links)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS pageRankScores (docId INTEGER, score REAL); ''')
    pageRankScoresData = [(int(docId), float(score))
                          for docId, score in pageRankScores.items()]
    # Crawled docs that received no score are stored with an explicit 0.0.
    unscoredLinks = [(int(docId), float(0.0))
                     for docId in self._doc_id_cache.values()
                     if docId not in pageRankScores.keys()]
    self.cur.executemany(''' INSERT INTO pageRankScores VALUES (?,?) ''', pageRankScoresData)
    self.cur.executemany(''' INSERT INTO pageRankScores VALUES (?,?) ''', unscoredLinks)
    self.db_conn.commit()
    # save docTitle data to persistent storage
    # docTitle: docId (INTEGER), title (TEXT)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS docTitle (docId INTEGER, title TEXT); ''')
    docTitles = [(int(docId), str(title))
                 for docId, title in self._doc_title_cache.items()]
    self.cur.executemany(''' INSERT INTO docTitle VALUES (?,?) ''', docTitles)
    self.db_conn.commit()
    # save docWordHits data to persistent storage
    # docWordHits: docId (INTEGER), wordId (INTEGER), fontSize (INTEGER), wordLocation (INTEGER)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS docWordHits (docId INTEGER, wordId INTEGER, fontSize INTEGER, wordLocation INTEGER); ''')
    wordHits = []
    for docId in self._doc_wordHits_cache.keys():
        for hit in self._doc_wordHits_cache[docId]:
            wordHits.append(
                (int(docId), int(hit[0]), int(hit[1]), int(hit[2])))
    self.cur.executemany(''' INSERT INTO docWordHits VALUES (?,?,?,?) ''', wordHits)
    self.db_conn.commit()
    # save docAnchorHits data to persistent storage
    # docAnchorHits: docId (INTEGER), wordId (INTEGER), anchorFontSize (INTEGER)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS docAnchorHits (docId INTEGER, wordId INTEGER, anchorFontSize INTEGER); ''')
    anchorHits = []
    for docId in self._doc_anchorHits_cache.keys():
        for hit in self._doc_anchorHits_cache[docId]:
            anchorHits.append((int(docId), int(hit[0]), int(hit[1])))
    self.cur.executemany(''' INSERT INTO docAnchorHits VALUES (?,?,?) ''', anchorHits)
    self.db_conn.commit()
    # save docSnippet data to persistent storage
    # docSnippet: docId (INTEGER), snippet (TEXT)
    self.cur.execute(''' CREATE TABLE IF NOT EXISTS docSnippet (docId INTEGER, snippet TEXT); ''')
    docSnippets = [(docId, snippet)
                   for docId, snippet in self._doc_snippet_cache.items()]
    self.cur.executemany(''' INSERT INTO docSnippet VALUES (?,?) ''', docSnippets)
    self.db_conn.commit()
def rank_page(self):
    """Score every cached link key with PageRank and store the result."""
    link_keys = self._link_cache.keys()
    self._page_ranks = pagerank.page_rank(link_keys)
def compute_page_rank(self):
    """Compute a PageRank score for every doc from the accumulated
    _page_rank_list link pairs and cache it on the instance."""
    rank_input = self._page_rank_list
    self._page_rank = page_rank(rank_input)