def populate_inlinks_dict(lines):
    try:
        for line in lines:
            words = line.split()
            M[words[0]] = words[1:]  # populating the inlinks dictionary
    except Exception as e:
        logerror(e)

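# A minimal sketch of the inlink-graph format assumed by populate_inlinks_dict
# (the exact file format is an assumption, not confirmed elsewhere in the code):
# each line names a page followed by the pages that link to it, e.g.
#
#   populate_inlinks_dict(["A B C", "B C"])
#   # M now holds {"A": ["B", "C"], "B": ["C"]}
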
def start():
    try:
        inlinks_file_path = raw_input("> Enter the path to the inlink graph file with proper extension. \n>\t")
        inlinks_file = open(inlinks_file_path, "r")
        lines = inlinks_file.readlines()
        inlinks_file.close()
        populate_inlinks_dict(lines)
        populate_outlinks_dict()
        populate_sink_pages()
        populate_page_rank()
        calc_page_rank()
        sort_pages_rank()
        sort_pages_inlink()
        print "================================================================="
        print "========================STATISTICS=============================== \n"
        sinks = number_of_sinks()
        sources = number_of_sources()
        total_pages = len(M.keys())
        print("Total number of pages = " + str(total_pages))
        print("Proportion of Sinks = " + str(sinks / float(total_pages)))
        print("Proportion of Sources = " + str(sources / float(total_pages)))
        print "================================================================="
        print "================================================================="
    except Exception as e:
        logerror(e)

def fetch_all_links(current_crawl, keyword):
    base_url = "https://en.wikipedia.org"
    pattern = re.compile('^/wiki/')
    new_links_list = []
    try:
        html_content = get_html_content(current_crawl)
        # collect all links within the bodyContent div of the page; the regex keeps only
        # hrefs that start with /wiki/, administrative links (containing ':') are skipped,
        # and any '#' fragment is stripped from the url below
        links = html_content.find("div", {"id": "bodyContent"}).find_all('a', href=pattern)
        for link in links:
            if ":" not in link.get('href'):
                url = urlparse.urljoin(base_url, link.get('href'))
                if "#" in link.get('href'):
                    url = url[:url.index('#')]
                anchor = link.text.encode("utf-8")
                match = re.search(r'.*{0}.*'.format(keyword), url, re.I)
                keywordsearch = re.search(r'.*{0}.*'.format(keyword), anchor, re.I)
                if (url not in new_links_list) and (match or keywordsearch):
                    new_links_list.append(url)
                    LINK_ANCHOR.update({url: anchor})
    except Exception as e:
        logerror(e)
    return new_links_list

def cal_perplexity():
    try:
        entropy = 0.0  # initiating the value of entropy to 0.0
        for p in PR.keys():
            entropy -= PR[p] * log(PR[p], 2)
        return 2 ** entropy
    except Exception as e:
        logerror(e)

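# For reference: the loop above computes the Shannon entropy of the PageRank
# distribution, H = -sum_p PR[p] * log2(PR[p]), and the perplexity is 2 ** H.
# For a uniform distribution over N pages the perplexity equals N, so the value
# shrinks toward a stable number as the PageRank vector converges.
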
def populate_sink_pages():
    try:
        for p in M.keys():  # iterating through all the unique crawled pages
            if not L.has_key(p):  # checking whether the particular page p has any outlink
                S.append(p)
    except Exception as e:
        logerror(e)

def populate_page_rank():
    try:
        total_pages = float(len(M.keys()))  # making it float because the value will be required to calculate PR
        for p in M.keys():
            PR[p] = 1.0 / total_pages  # initiating the page rank of each page as 1/total number of pages
    except Exception as e:
        logerror(e)

def number_of_sources():
    try:
        c = 0
        for p in M:
            if not M[p]:
                c += 1
        print("Number of Sources: " + str(c))
        return float(c)
    except Exception as e:
        logerror(e)

def populate_outlinks_dict():
    try:
        for p in M.keys():  # iterating through all the unique crawled pages
            for q in M.get(p):  # iterating through the list of inlinks for page p
                if L.has_key(q):  # if page q already exists in the outlink dict, increment the counter
                    L[q] += 1
                else:  # if page q does not exist in the outlink dict, start the counter at 1
                    L[q] = 1
    except Exception as e:
        logerror(e)

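# Worked example of the relationship between M and L assumed here: M maps a page
# to its inlinks, so every appearance of a page q in some inlink list counts as
# one outlink of q. With M = {"A": ["B", "C"], "B": ["C"]},
# populate_outlinks_dict leaves L == {"B": 1, "C": 2}.
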
def sort_pages_rank():
    try:
        spr = sorted(PR.iteritems(), key=operator.itemgetter(1), reverse=True)
        sorted_file = open("SortedPage.txt", "a")
        for sp in range(min(len(spr), 50)):  # write at most the top 50 pages by PageRank
            sorted_file.write(str(spr[sp]) + "\n")
        sorted_file.close()
    except Exception as e:
        logerror(e)

def sort_pages_inlink():
    try:
        inlink_rank = {}
        for p in M:
            inlink_rank[p] = len(M.get(p))
        inlink_rank = sorted(inlink_rank.iteritems(), key=operator.itemgetter(1), reverse=True)
        inlink_file = open("InlinkPageRank.txt", "a")
        for ir in range(min(len(inlink_rank), 5)):  # write the top 5 pages by inlink count
            inlink_file.write(str(inlink_rank[ir]) + " \n")
        inlink_file.close()
    except Exception as e:
        logerror(e)

def get_html_content(url):
    try:
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html, "html.parser")
        if LINK_ANCHOR.has_key(url):
            file_name = re.sub(r'[\W]', '_', LINK_ANCHOR[url])  # replace non-word characters in the anchor text
        else:
            file_name = re.sub(r'[\W]', '_', url)  # replace non-word characters in the url
        LINK_FILENAME.update({url: file_name + ".html"})
        out_file = open(CRAWLED_HTML_PATH + "\\" + file_name + ".html", 'w')
        out_file.write(url.encode('UTF-8') + "\n" + content.prettify().encode('UTF-8'))
        out_file.close()
        html.close()
        return content
    except Exception as e:
        logerror(e)

def start():
    try:
        remove_files()
        depth = raw_input("> Enter the depth where depth starts from 1 \n>\t")
        crawled_limit = raw_input("> Enter the number limit of crawled urls\n>\t")
        seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
        crawledlist = web_crawl(seed_url, int(depth), int(crawled_limit))
        if crawledlist:
            out_file = open(CRAWLEDLISTPATH + "\\crawled_list.txt", 'a')
            count = 1
            for url in crawledlist:
                out_file.write(str(count) + ".\t" + url + "\n")
                count += 1
            out_file.close()
            print "The crawled_list.txt can be found in %s" % (CRAWLEDLISTPATH)
            print "The crawled list of htmls can be found in %s" % (CRAWLED_HTML_PATH)
        print "The error file can be found at %s" % (ERROR_FILE_PATH)
    except Exception as e:
        logerror(e)

def web_crawl(seed, max_depth, crawled_limit):
    frontier_crawl = [seed]  # this list maintains the urls still to be crawled at the current depth
    visited = []
    next_depth_urls = []  # this list stores all unique urls discovered for the next depth
    depth = 1  # the seed page is depth 1
    try:
        while frontier_crawl and depth <= max_depth and len(visited) < crawled_limit:
            current_crawl = frontier_crawl.pop(0)  # crawl urls in the order they were discovered
            if current_crawl not in visited:
                new_url_links = fetch_all_links(current_crawl)
                if new_url_links is not None:
                    merge_results(next_depth_urls, new_url_links)
                visited.append(current_crawl)
                time.sleep(1)  # waiting policy of 1 second between requests
            if not frontier_crawl:
                # once the frontier for the current depth is empty, move on to the next depth
                frontier_crawl, next_depth_urls = next_depth_urls, []
                depth += 1
    except Exception as e:
        logerror(e)
        return visited
    return visited

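# A minimal usage sketch of the breadth-first crawler (the depth and limit values
# are illustrative assumptions):
#
#   crawled = web_crawl("https://en.wikipedia.org/wiki/Sustainable_energy", 3, 1000)
#   # `crawled` holds up to 1000 pages visited level by level, with the seed at depth 1
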
def calc_page_rank():
    try:
        # total_pages is the total number of unique crawled pages
        total_pages = float(len(M.keys()))  # making it float because the value will be required to calculate PR
        # d is the damping factor / teleportation factor
        d = 0.85
        perplexity = 0.0  # initial value of perplexity
        convergence_count = 0  # counts consecutive iterations with a perplexity change below 1
        iteration_count = 0  # will track the total number of iterations required to converge
        while convergence_count < 4:
            sinkPR = 0.0
            for p in S:
                sinkPR += PR[p]
            for p in M.keys():
                NEWPR[p] = (1.0 - d) / total_pages  # teleportation factor
                NEWPR[p] += d * sinkPR / total_pages  # spreading the remaining sinkPR evenly
                for q in M[p]:  # traversing through the inlinks of page p
                    NEWPR[p] += d * PR[q] / L[q]  # add share of PageRank from inlinks for page
            for page in M.keys():
                PR[page] = NEWPR[page]  # setting the new PageRank
            new_perplexity = cal_perplexity()
            if abs(new_perplexity - perplexity) < 1.0:
                convergence_count += 1
            else:
                convergence_count = 0
            perplexity = new_perplexity
            iteration_count += 1
            outfile = open("perplexity_per_round.txt", "a")
            outfile.write("Perplexity value: " + str(perplexity) +
                          " for the iteration: " + str(iteration_count) + "\n")
            outfile.close()
    except Exception as e:
        logerror(e)

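# The update applied in each iteration above corresponds to
#
#   PR(p) = (1 - d)/N + d * sinkPR/N + d * sum over inlinks q of p of PR(q)/L(q)
#
# where N is the number of crawled pages, d = 0.85 is the damping factor,
# sinkPR is the rank held by pages without outlinks, and L(q) is the outlink
# count of q. Iteration stops once the perplexity has changed by less than 1
# for four consecutive rounds.
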
def get_html_content(url):
    try:
        counter = 1
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html, "html.parser")
        article_id = url.split("/wiki/")[1]
        file_name = re.sub(r'-*_*', '', article_id)  # strip hyphens and underscores from the article id
        if file_name not in LINK_FILENAME:
            LINK_FILENAME.append(file_name)
        else:
            # the file name already exists, so append an increasing counter until it is unique
            while file_name in LINK_FILENAME:
                file_name = file_name + str(counter)
                counter = counter + 1
            LINK_FILENAME.append(file_name)
        out_file = open(CRAWLED_HTML_PATH + "\\" + file_name + ".txt", 'w')
        out_file.write(url.encode('UTF-8') + "\n" + content.prettify().encode('UTF-8'))
        out_file.close()
        html.close()
        return content
    except Exception as e:
        logerror(e)
        return None

def web_crawl(url, max_depth, crawled_limit, keyword, visited):
    current_depth = 1
    try:
        if current_depth <= max_depth and len(visited) < crawled_limit:
            frontier_crawl = fetch_all_links(url, keyword)
            if url not in visited:
                visited.append(url)
            for new_url in frontier_crawl:
                if len(visited) < crawled_limit and max_depth > current_depth:
                    if new_url not in visited:
                        merge_results(visited,
                                      web_crawl(new_url, max_depth - 1, crawled_limit, keyword, visited))
                else:
                    break
                time.sleep(1)  # waiting policy of 1 second between requests
        else:
            return []
    except Exception as e:
        logerror(e)
        return visited
    return visited

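# A minimal usage sketch of the keyword-focused, depth-first variant (the keyword
# and the limit values are illustrative assumptions):
#
#   crawled = web_crawl("https://en.wikipedia.org/wiki/Sustainable_energy",
#                       3, 1000, "solar", [])
#   # only links whose URL or anchor text matches "solar" are followed
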
def number_of_sinks():
    try:
        print("Number of Sinks: " + str(len(S)))
        return float(len(S))
    except Exception as e:
        logerror(e)