Example #1
    def test_http_counts_as_internal_link(self):
        self.svc.get('requests')._expect("https://example.com", 200, '<a href="http://example.com/insecure">click here</a>')
        self.svc.get('requests')._expect("http://example.com/insecure", 200, '<different><stuff>')

        crawler = Crawler(self.svc, "https://example.com")
        siteMap = crawler.map()
        self.assertEqual({"https://example.com":{"assets":[], "links":["http://example.com/insecure"]},
                          "http://example.com/insecure": {"assets": [], "links": []}}, siteMap)
Example #2
    def test_query_params_are_captured(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="/?foo=bar">click here</a>')
        self.svc.get('requests')._expect("http://example.com/?foo=bar", 200, '<different><stuff>')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["/?foo=bar"]},
                          "http://example.com/?foo=bar": {"assets": [], "links": []}}, siteMap)
Example #3
    def test_disallowed_urls_are_not_fetched(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="http://example.com/admin">click here</a>')
        self.svc.get('RobotFileParser')._disallowed_urls['http://example.com/admin'] = True

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["http://example.com/admin"]},
                         "http://example.com/admin": {"error": "Disallowed by robots.txt"}}, siteMap)
Example #4
    def test_relative_links_are_captured(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="foobar/">click here</a>')
        self.svc.get('requests')._expect("http://example.com/foobar/", 200, '')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["foobar/"]},
                          "http://example.com/foobar/": {"assets": [], "links": []}}, siteMap)
Example #5
    def test_redirects_to_unknown_protocols_are_handled(self):
        self.svc.get('requests')._expect(
            "http://example.com",
            200,
            '<a href="https://www.example.com/foobar">click here</a>',
            finalUrl="foo:bar")

        crawler = Crawler(self.svc, "example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"error": "Error fetching url: InvalidSchema('Unrecognized scheme: foo:bar',)"}}, siteMap)
Example #6
    def test_dont_rerequest_a_redirect(self):
        self.svc.get('requests')._expect(
            "http://example.com",
            200,
            '<a href="http://example.com">click here</a>',
            finalUrl="http://example.com/foo")

        crawler = Crawler(self.svc, "example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com/foo":{"original_url": "http://example.com", "assets":[], "links":["http://example.com"]},
                          "http://example.com": {"redirects_to": "http://example.com/foo"}}, siteMap)
Example #7
    def test_redirects_are_handled_nicely(self):
        self.svc.get('requests')._expect(
            "http://example.com",
            200,
            '<a href="https://www.example.com/foobar">click here</a>',
            finalUrl="https://www.example.com")
        self.svc.get('requests')._expect("https://www.example.com/foobar", 200, '')

        crawler = Crawler(self.svc, "example.com")
        siteMap = crawler.map()
        self.assertEqual({"https://www.example.com":{"original_url": "http://example.com", "assets":[], "links":["https://www.example.com/foobar"]},
                          "https://www.example.com/foobar": {"assets": [], "links": []},
                          "http://example.com": {"redirects_to": "https://www.example.com"}}, siteMap)
Example #8
import sys

def main():
    # Crawl the domain given on the command line and print the resulting site map.
    domain = sys.argv[1]
    svc = RealServiceProvider()
    crawler = Crawler(svc, domain)
    print("Mapping...")
    siteMap = crawler.map(verbose=True)
    print("Complete.")
    print()
    print("SiteMap:")
    for url, data in siteMap.items():
        print(url)

        for prop in ["error", "original_url", "redirects_to"]:
            if data.get(prop, False):
                print("  %s: %s" % (prop, data[prop]))

        for prop in ["assets", "links"]:
            print("  %s:" % (prop,))
            for p in data.get(prop, []):
                print("    %s" % (p,))
Example #9
    def test_subdomain_doesnt_count_as_internal(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="http://api.example.com">click here</a>')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["http://api.example.com"]}}, siteMap)
Example #10
    def test_leaving_out_scheme_in_domain_is_fine(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="http://example.com">click here</a>')

        crawler = Crawler(self.svc, "example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["http://example.com"]}}, siteMap)
Example #11
    def test_recursive_links_dont_cause_re_fetch(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="http://example.com">click here</a>')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["http://example.com"]}}, siteMap)
Example #12
    def test_external_links_are_not_captured(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="http://foobar.com">click here</a>')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["http://foobar.com"]}}, siteMap)
Example #13
    def test_link_href_is_captured_as_asset(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<link href="/css/foo.css"></script>')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":["/css/foo.css"], "links":[]}}, siteMap)
Example #14
    def test_image_src_is_captured_as_asset(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<img src="/img/foo.png">')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":["/img/foo.png"], "links":[]}}, siteMap)
Example #15
    def test_error_urls_are_noted(self):
        self.svc.get('requests')._expect("http://example.com", 400, 'Error: bad request')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"error": "Error fetching url. Response code: 400"}}, siteMap)