示例#1
0
    def test_parse_sitemap(self, sitemap_content):
        """Check that _parse_sitemap returns as many items as there are url tags.

        The expected count is obtained independently by parsing the same
        sitemap_content with Soup and counting its 'url' elements.

        sitemap_content is presumably injected by a data-driven decorator
        that is not visible in this excerpt -- confirm against the full file.
        """
        spider = website.Spider(self.site_url, None)
        parsed_locations = spider._parse_sitemap(sitemap_content)

        # Reference count taken straight from the raw markup.
        url_tags = Soup(sitemap_content, "html.parser").findAll('url')

        self.assertEqual(len(parsed_locations), len(url_tags))
示例#2
0
    def test_init_sitemap_negative(self, sitemap_content, mock_requests):
        """Check the site URL is queued when the sitemap response is a 404.

        The mocked response reports requests.codes.not_found while carrying
        sitemap_content as its body. The spider is then built with an
        explicit sitemap URL, and self.site_url must be present in its
        pages_to_crawl.

        mock_requests is presumably the patched HTTP call used by
        website.Spider and sitemap_content is presumably supplied by a
        data-driven decorator; neither decorator is visible in this
        excerpt -- confirm against the full file.
        """
        sitemap_url = "/sitemap.xml"

        response = mock_requests.return_value
        response.status_code = requests.codes.not_found
        response.content = sitemap_content

        wp = website.Spider(self.site_url, self.site_url + sitemap_url)

        # assertIn reports both operands on failure, whereas
        # assertTrue(x in y) only says "False is not true".
        self.assertIn(self.site_url, wp.pages_to_crawl)
示例#3
0
    def test_analyze_crawlers(self, resp_code, mock_requests):
        """Check what _analyze_crawlers records for a given response status.

        When the mocked status equals requests.codes.ok, the ROBOTS.TXT badge
        must appear among wp.achieved; for any other status the ROBOTS.TXT
        warning must appear among wp.issues.

        resp_code is converted with int() before use. It and mock_requests
        are presumably provided by decorators that are not visible in this
        excerpt -- confirm against the full file.
        """
        status = int(resp_code)
        mock_requests.return_value.status_code = status

        spider = website.Spider(self.site_url, None)
        spider._analyze_crawlers()

        if status != requests.codes.ok:
            # Failure path: the warning has to be among the recorded issues.
            warning = WARNINGS["ROBOTS.TXT"]
            was_raised = any(
                entry["warning"] == warning for entry in spider.issues)
            self.assertTrue(was_raised, "{0} not raised.".format(warning))
            return

        # Success path: the badge has to be among the achievements.
        badge = BADGES["ROBOTS.TXT"]
        was_earned = any(
            entry["achievement"] == badge for entry in spider.achieved)
        self.assertTrue(was_earned, "{0} not earned".format(badge))
示例#4
0
    def test_crawl(self, data, mock_requests):
        """Run crawl() against a mocked response and check the issues raised.

        data is a "<status code>|<body>" string. The status code and body are
        installed on the mocked response before crawl() is called:

        * status equal to requests.codes.ok: wp.issues must be empty;
        * status equal to requests.codes.not_found: the BROKEN_LINK warning
          must be among wp.issues;
        * any other status: the SERVER_ERROR warning must be among wp.issues.

        data and mock_requests are presumably provided by decorators that
        are not visible in this excerpt -- confirm against the full file.
        """
        wp = website.Spider(self.site_url, None)
        # Shadow the real method with a mock so it is not executed
        # (presumably crawl() invokes it -- not visible from here).
        wp._analyze_crawlers = mock.MagicMock(name="_analyze_crawlers")

        # Set up the mocked response. Split on the first "|" only, so a body
        # that itself contains "|" does not break the two-way unpacking.
        resp_code, content = data.split("|", 1)
        status = int(resp_code)

        mock_requests.return_value.status_code = status
        mock_requests.return_value.content = content
        wp.crawl()

        if status == requests.codes.ok:
            self.assertEqual(len(wp.issues), 0)

        elif status == requests.codes.not_found:
            self.assertTrue(
                any(issue["warning"] == WARNINGS["BROKEN_LINK"]
                    for issue in wp.issues),
                "{0} not raised.".format(WARNINGS["BROKEN_LINK"]))
        else:
            self.assertTrue(
                any(issue["warning"] == WARNINGS["SERVER_ERROR"]
                    for issue in wp.issues),
                "{0} not raised.".format(WARNINGS["SERVER_ERROR"]))
示例#5
0
    def test_init_url(self):
        """Check a Spider built without a sitemap queues only the site URL.

        pages_to_crawl must hold exactly one entry and that entry must equal
        self.site_url.
        """
        spider = website.Spider(self.site_url, None)
        queued = spider.pages_to_crawl

        # Length is checked first so an empty queue fails as an assertion
        # rather than as an IndexError on the lookup below.
        self.assertEqual(len(queued), 1)
        self.assertEqual(queued[0], self.site_url)
示例#6
0
def analyze(domain, sitemap):
    """Crawl a site and return the resulting report as pretty-printed JSON.

    Args:
        domain: passed to website.Spider as its first argument.
        sitemap: passed to website.Spider as its second argument.

    Returns:
        The value returned by Spider.crawl(), serialized by json.dumps with
        a 4-space indent and (',', ': ') separators.
    """
    crawl_report = website.Spider(domain, sitemap).crawl()
    return json.dumps(crawl_report, separators=(',', ': '), indent=4)