def test_generate_html(self):
    index = {
        'http://www.davidcmoss.co.uk/': Page(
            title=['David C Moss'],
            links=set([u'http://www.davidcmoss.co.uk/static/Curriculum Vitae.pdf']),
            ex_links=set(['https://www.heroku.com']),
            images=['/static/img/profile.jpeg']),
        'http://www.davidcmoss.co.uk/static/Curriculum%20Vitae.pdf': Page(
            title=[],
            links=set([]),
            ex_links=set([]),
            images=[])
    }
    html = generate_html(index)
    assert "https://www.heroku.com" in html
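# The test above calls a generate_html helper that is not shown in this snippet.
# The following is only a minimal sketch of such a function, assuming each Page
# value exposes links, ex_links and images attributes; the real rendering in the
# project may differ.
def generate_html(index):
    """Render the crawl index as a simple HTML listing (illustrative sketch only)."""
    parts = ['<html><body>']
    for url, page in index.items():
        parts.append('<h2><a href="{0}">{0}</a></h2>'.format(url))
        parts.append('<ul>')
        for link in sorted(page.links):
            parts.append('<li>internal: <a href="{0}">{0}</a></li>'.format(link))
        for link in sorted(page.ex_links):
            parts.append('<li>external: <a href="{0}">{0}</a></li>'.format(link))
        for img in page.images:
            parts.append('<li>image: {0}</li>'.format(img))
        parts.append('</ul>')
    parts.append('</body></html>')
    return '\n'.join(parts)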
def test_crawl_site(self):
    index = crawl_site('http://www.davidcmoss.co.uk')
    expected_response = {
        u'http://www.davidcmoss.co.uk/': Page(
            title=u'David C Moss',
            links=set([u'http://www.davidcmoss.co.uk/static/Curriculum Vitae.pdf']),
            ex_links=['https://www.heroku.com'],
            images=[u'http://www.davidcmoss.co.uk/static/img/profile.jpeg']),
        u'http://www.davidcmoss.co.uk/static/Curriculum Vitae.pdf': Page(
            title=[],
            links=set([]),
            ex_links=[],
            images=[])
    }
    assert index == expected_response
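# Both tests construct Page records with title, links, ex_links and images fields
# and compare them by value. A minimal sketch of that structure, assuming it is a
# plain namedtuple (the real definition lives in the crawler module and may differ):
from collections import namedtuple

Page = namedtuple('Page', ['title', 'links', 'ex_links', 'images'])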
def build_crawl_list(self):
    """Build a list of all of the URLs to crawl, based on the depth specified."""
    current_depth = 1
    page = requests.get(self.base_url).text
    if self.children > 0:
        self.urls = Page.get_urls(page)[:self.children]
    else:
        self.urls = Page.get_urls(page)

    # Holds previously scanned URLs, to stop the same URL being scanned twice.
    scanned_urls = []
    while current_depth <= self.depth:
        # Collect the links for each page, then search those pages for more.
        print('Starting crawl depth', current_depth, 'with', len(self.urls), 'URLs to scan')
        new_urls = []
        for url in self.urls:
            # Only scan URLs that have not been seen before and that are not images, XML feeds, etc.
            if url not in scanned_urls and TasteDotCom.is_wanted_object(url):
                print('Looking for child URLs in', url)
                markup = requests.get(url).text
                scanned_urls.append(url)
                found = Page.get_urls(markup)
                if self.children > 0:
                    found = found[:self.children]
                print('Found', len(found), 'new pages')
                new_urls += found
        # for url in new_urls:
        #     check_and_add(url)
        self.urls += new_urls
        current_depth += 1
    print('Finished crawling', self.base_url, 'found', len(self.urls), 'total URLs')

# def run(self):
#     """Start crawling the page specified."""
#     # TODO: make use of this method
#     print('Starting crawl session for', self.base_url)
#     page = requests.get(self.base_url).text
#     child_urls = Page.get_urls(page)
#     for url in child_urls:
#         self.check_and_add(url)

# def check_and_add(url):
#     pass
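# The commented-out check_and_add above is only a stub (and is missing `self`).
# One possible implementation, sketched here purely as an assumption, that would
# let run() queue URLs without re-adding ones already in the crawl list:
def check_and_add(self, url):
    """Queue a URL for crawling if it is new and a wanted page type (sketch only)."""
    if url not in self.urls and TasteDotCom.is_wanted_object(url):
        self.urls.append(url)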
def test_get_urls(self):
    # Use a known static page for testing.
    with open("../miscellany/sausage_and_punpkin_curry.html") as f:
        html = f.read()
    urls = Page.get_urls(html)
    self.assertEqual(len(urls), 387)
    pattern = re.compile(r"http://www\.[/\w.+]+")
    for url in urls:
        # Check each URL matches a hyperlink pattern.
        self.assertTrue(pattern.match(url))
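# Page.get_urls is exercised above but not defined in this snippet. The following
# is a minimal sketch of what such a helper could look like; it is an assumption,
# and the real method may well use an HTML parser rather than a regex.
import re

def get_urls(markup):
    """Return every absolute http(s) href found in the given HTML string."""
    return re.findall(r'href=[\'"]?(https?://[^\'" >]+)', markup)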
import networkx as nx
import matplotlib.pyplot as plt

from crawler import Crawler, Page, Document, Corpus

if __name__ == '__main__':
    start_page = Page('http://info.cern.ch/hypertext/WWW/TheProject.html')
    crawler = Crawler(start_page)
    crawler.crawl()

    # Build a directed graph of the crawled web: one edge per outgoing link.
    web_graph = nx.DiGraph()
    edges = []
    for page in crawler.web:
        for link in page.links:
            edges.append((hash(page.address), hash(link)))
    web_graph.add_edges_from(edges)

    nx.draw(web_graph)
    plt.show()

    # Rank pages by PageRank and attach the score to each crawled page.
    # Pages with no edges never make it into the graph, so default their rank to 0.
    page_ranks = nx.pagerank(web_graph)
    for page in crawler.web:
        page.page_rank = page_ranks.get(hash(page.address), 0.0)

    pages = sorted(crawler.web, key=lambda p: p.page_rank, reverse=True)
    corpus = []
    for page in pages:
        corpus.append((page.address, page.text))
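# A tiny self-contained illustration of what nx.pagerank returns: a dict mapping
# each node to a score that sums to 1, with better-linked nodes scoring higher.
# The toy graph below is purely for demonstration and is not part of the crawler.
import networkx as nx

toy = nx.DiGraph()
toy.add_edges_from([('a', 'b'), ('c', 'b'), ('b', 'a')])
ranks = nx.pagerank(toy)
print(sorted(ranks, key=ranks.get, reverse=True))  # 'b' ranks highest: two inbound links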