def crawl(self):
    """Run the site-wide checks, then fetch and analyze every queued page.

    Returns the accumulated report dict with per-page results under
    'pages' and site-level issues/achievements under 'site'.
    """
    # Checks that apply to the whole site rather than individual pages.
    self._analyze_crawlers()
    self._analyze_mobile()
    self._analyze_analytics()

    for page_url in self.pages_to_crawl:
        print("Crawled {0} Pages of {1}".format(
            len(self.pages_crawled), len(self.pages_to_crawl)))
        resp = requests.get(page_url)
        if resp.status_code == requests.codes.ok:
            analyzed = webpage.Webpage(
                page_url, resp.content, self.titles, self.descriptions)
            self.report['pages'].append(analyzed.report())
            # Remember this page so it is counted as done.
            self.pages_crawled.append(page_url.strip().lower())
        elif resp.status_code == requests.codes.not_found:
            self.warn(WARNINGS["BROKEN_LINK"], page_url)
        else:
            # Anything else (5xx etc.) is reported as a server problem.
            self.warn(
                WARNINGS["SERVER_ERROR"],
                "HTTP{0} received for {1}".format(resp.status_code, page_url))

    # Roll the site-wide findings into the report alongside the pages.
    self.report['site'] = {
        "issues": self.issues,
        "achieved": self.achieved,
    }
    return self.report
def test_visible_tags(self, data):
    """Every text node in the fixture should get the expected visibility verdict.

    data[0] is the fixture file to parse; data[1] is the expected result
    of visible_tags() for each of its text nodes.
    """
    self.wp = webpage.Webpage(
        "https://www.drawbuildplay.com", "", self.titles, self.descriptions)
    soup = self.soup_file(data[0])
    for text_node in soup.findAll(text=True):
        self.assertEqual(self.wp.visible_tags(text_node), data[1])
def test_analyze_negative_url(self, data):
    """Analyzing a problematic URL must record the expected warning.

    data[0] is the URL under test; data[1] is the WARNINGS key that
    report() is expected to raise for it.
    """
    bad_url, expected_error = data[0], data[1]
    self.wp = webpage.Webpage(bad_url, "", self.titles, self.descriptions)
    self.wp.report()
    was_raised = any(
        issue["warning"] == WARNINGS[expected_error]
        for issue in self.wp.issues)
    self.assertTrue(
        was_raised, "{0} not raised.".format(WARNINGS[expected_error]))
def test_analyze_negative(self, data):
    """Analyzing problematic HTML must record the expected warning.

    data[0] is the HTML under test; data[1] is the WARNINGS key that
    report() is expected to raise for it.
    """
    bad_html, expected_error = data[0], data[1]
    self.wp = webpage.Webpage(
        "https://www.drawbuildplay.com", bad_html,
        self.titles, self.descriptions)
    self.wp.report()
    was_raised = any(
        issue["warning"] == WARNINGS[expected_error]
        for issue in self.wp.issues)
    self.assertTrue(
        was_raised, "{0} not raised.".format(WARNINGS[expected_error]))
def test_analyze_positive_url(self, data):
    """A well-formed URL should earn the expected badge (when one is given).

    data[0] is the URL under test; data[1] is the BADGES key expected to
    be earned, or "" when no badge applies.
    """
    good_url, badge = data[0], data[1]
    self.wp = webpage.Webpage(good_url, "", self.titles, self.descriptions)
    self.wp.report()
    # Some parameter sets carry no badge expectation at all.
    if badge == "":
        return
    earned_it = any(
        earned["achievement"] == BADGES[badge]
        for earned in self.wp.achieved)
    self.assertTrue(earned_it, "{0} not earned".format(BADGES[badge]))
def test_analyze_positive(self, data):
    """Well-formed HTML should earn the expected badge (when one is given).

    data[0] is the HTML under test; data[1] is the BADGES key expected to
    be earned, or "" when no badge applies.
    """
    good_html, badge = data[0], data[1]
    self.wp = webpage.Webpage(
        "https://www.drawbuildplay.com", good_html,
        self.titles, self.descriptions)
    self.wp.report()
    # Some parameter sets carry no badge expectation at all.
    if badge == "":
        return
    earned_it = any(
        earned["achievement"] == BADGES[badge]
        for earned in self.wp.achieved)
    self.assertTrue(earned_it, "{0} not earned".format(BADGES[badge]))
def test_analyze_duplicates_negative(self, page):
    """Two pages sharing the same content must be flagged as duplicates.

    page[0] is the HTML served from both URLs; page[1] is the WARNINGS key
    expected to appear in at least one page's issue list.
    """
    dup_html, expected_error = page[0], page[1]
    report = {"pages": []}
    # Analyze the identical markup under two distinct URLs.
    for idx in range(2):
        self.wp = webpage.Webpage(
            "https://www.drawbuildplay.com/page{0}.html".format(idx),
            dup_html, self.titles, self.descriptions)
        report['pages'].append(self.wp.report())
    flagged = any(
        issue["warning"] == WARNINGS[expected_error]
        for crawled in report['pages']
        for issue in crawled['issues'])
    self.assertTrue(
        flagged,
        "{0} not raised. {1} {2}".format(
            WARNINGS[expected_error], self.titles, self.descriptions))