Exemplo n.º 1
0
    def test_challenge(self):
        """Crawl the test2 site and check the graph lookup for page2.html.

        The test passes when indexing ``crawler.graph`` with the page2.html
        URL yields a value that is not ``None``.
        """
        start_url = "http://triplebyte.github.io/web-crawler-test-site/test2"
        target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html"

        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl(start_url, None, True)

        # Show the graph entry on stdout before asserting on it.
        print(crawler.graph[target_url])
        self.assertIsNotNone(crawler.graph[target_url])
Exemplo n.º 2
0
    def test_crawling(self):
        """Crawl the already-passing-tests site and verify the recorded nodes.

        Checks that page2, page2-real and page2-fake are all present in
        ``crawler.graph.nodes``, that page2-real has status ``'success'``, and
        that page2-fake has status code 404 together with status ``'success'``.
        """
        page2_url = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2"
        real_url = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"
        fake_url = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"

        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/",
            None, True)

        # Every page must have been recorded as a node.
        for expected_url in (page2_url, real_url, fake_url):
            self.assertIn(expected_url, crawler.graph.nodes)

        self.assertEqual(crawler.graph.nodes[real_url].status, 'success')

        # The fake page is expected to carry a 404 status code while its
        # node status is still 'success'.
        self.assertEqual(crawler.graph.nodes[fake_url].status_code, 404)
        self.assertEqual(crawler.graph.nodes[fake_url].status, 'success')
Exemplo n.º 3
0
    def test_challenge(self):
        """Crawl the test1 site and check the SVG logo's request type.

        The entry that ``crawler.graph`` returns for the SVG logo URL must
        have ``request_type`` equal to ``"head"``.
        """
        svg_url = "http://triplebyte.github.io/web-crawler-test-site/test1/SVG_logo.svg"

        crawler = WebCrawler(5, VerboseCrawlerLogger)
        # The start target is passed without a URL scheme.
        crawler.crawl(
            "triplebyte.github.io/web-crawler-test-site/test1", None, True)

        svg_entry = crawler.graph[svg_url]
        self.assertEqual(svg_entry.request_type, "head")
Exemplo n.º 4
0
    def test_crawling(self):
        """Crawl the already-passing-tests site and verify nodes and requests.

        Runs ``assert_crawled_with_get`` for page2, page2-real and page2-fake,
        checks the status fields of the real and fake pages, and checks that
        the external JPEG is present in ``crawler.graph.nodes`` with
        ``request_type`` equal to ``'head'``.
        """
        page2_url = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2"
        real_url = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"
        fake_url = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"
        image_url = "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg"

        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/",
            None, True)

        for page_url in (page2_url, real_url, fake_url):
            self.assert_crawled_with_get(page_url, crawler)

        self.assertEqual(crawler.graph.nodes[real_url].status, 'success')

        # The fake page is expected to carry a 404 status code while its
        # node status is still 'success'.
        self.assertEqual(crawler.graph.nodes[fake_url].status_code, 404)
        self.assertEqual(crawler.graph.nodes[fake_url].status, 'success')

        # The external image must be recorded with request type 'head'.
        self.assertIn(image_url, crawler.graph.nodes)
        self.assertEqual(crawler.graph.nodes[image_url].request_type, 'head')
Exemplo n.º 5
0
    def test_challenge(self):
        """Crawl the test4 site and check that the https page3 URL is a node.

        The crawl starts from the ``http://`` test4 URL; the assertion looks
        for the ``https://`` page3 URL in ``crawler.graph.nodes``.
        """
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/test4/", None,
            True)

        # assertIn reports the missing URL and the container on failure,
        # whereas assertTrue(x in y) would only say "False is not true".
        self.assertIn(
            "https://triplebyte.github.io/web-crawler-test-site/test4/page3",
            crawler.graph.nodes)
Exemplo n.º 6
0
    def test_challenge(self):
        """Crawl the test3 site and check that blah.com:7091 is a graph node."""
        # The bug here is that the crawler will hang. Don't sit around waiting
        # for it to finish!
        start_url = "http://triplebyte.github.io/web-crawler-test-site/test3/"
        expected_node = "http://blah.com:7091"

        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl(start_url, None, True)

        self.assertIn(expected_node, crawler.graph.nodes)
    def test_crawling_triplebyte(self):
        """Crawl www.triplebyte.com and verify a few expected nodes.

        Checks that the home page and the careers page are present in
        ``crawler.graph.nodes`` and that the olark.com node has
        ``request_type`` equal to ``"head"``.
        """
        home_url = "https://www.triplebyte.com"
        careers_url = "https://triplebyte.com/careers"
        olark_url = "http://www.olark.com?welcome"

        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(home_url, None, True)

        for expected_url in (home_url, careers_url):
            self.assertIn(expected_url, crawler.graph.nodes)

        olark_node = crawler.graph.nodes[olark_url]
        self.assertEqual(olark_node.request_type, "head")
Exemplo n.º 8
0
def main():
    """Parse the command line and run a crawl.

    Command-line arguments:
        target: positional; passed to ``WebCrawler.crawl`` as the first
            argument.
        --number_of_threads: integer passed as the first argument to
            ``WebCrawler``; defaults to 5.
        --output_file: passed to ``WebCrawler.crawl`` as the second argument
            (``None`` when omitted).
        --verbose: select ``loggers.VerboseCrawlerLogger`` instead of
            ``loggers.SilentCrawlerLogger``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("target")
    # Convert to int at parse time: without ``type=int`` a supplied value
    # would reach WebCrawler as a string while the fallback is the int 5.
    parser.add_argument("--number_of_threads", type=int, default=5)
    parser.add_argument("--output_file")
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true")

    args = parser.parse_args()

    logger_class = (loggers.VerboseCrawlerLogger
                    if args.verbose else loggers.SilentCrawlerLogger)

    webcrawler = WebCrawler(args.number_of_threads, logger_class)
    webcrawler.crawl(args.target, args.output_file)