示例#1
0
    def test_that_adding_uris_works(self):
        """Add URIs for two hosts and check the identifier/queue bookkeeping.

        Every row of ``queue_identifiers`` must name one of the two hosts,
        every row of ``queues`` must carry the queue recorded for its URL,
        and the front-end queues must report a queue count of two.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # Rebuild the timestamp from its first six fields to drop the
        # sub-second part before it is serialized into the Date header.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 1

        frontier.add_uri(curi)

        # Host names are the expected identifiers; the loop below adds one
        # "http://<identifier>" entry per row that maps to the row's queue.
        idents = {"localhost": -1, "foreignhost": -1}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            # assertIn reports the offending identifier on failure.
            self.assertIn(row['identifier'], idents)
            idents["http://%s" % row['identifier']] = row['queue']

        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(idents[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
示例#2
0
    def test_that_adding_uris_works(self):
        """Check that URIs of two different hosts are filed under two queues.

        After adding ``http://localhost`` and ``http://foreignhost`` the test
        reads the ``queue_identifiers`` and ``queues`` tables through the
        front-end queue cursor and verifies that they agree with each other.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # Keep only year..second so the value has no sub-second part.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 1

        frontier.add_uri(curi)

        idents = {"localhost": -1, "foreignhost": -1}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            # Membership test on the dict itself; no need for .keys().
            self.assertIn(row['identifier'], idents)
            # Remember which queue belongs to the URL of this identifier.
            idents["http://%s" % row['identifier']] = row['queue']

        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(idents[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
示例#3
0
    def test_sinks(self):
        """Run every crawl-outcome handler once with an abstract sink attached.

        There are no explicit assertions: the test passes when none of the
        calls raises.
        """
        stamp = datetime.fromtimestamp(time.time())
        now = datetime(*stamp.timetuple()[0:6])
        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        log_handler = StreamHandler(sys.stdout)
        uri_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
        prioritizer = SimpleTimestampPrioritizer(settings)
        frontier = AbstractBaseFrontier(
            settings, log_handler, uri_queue, prioritizer)
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        # The URI is put on the heap again before each handler is invoked.
        handler_names = (
            "process_successful_crawl",
            "process_not_found",
            "process_redirect",
            "process_server_error",
        )
        for handler_name in handler_names:
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
            getattr(frontier, handler_name)(curi)
示例#4
0
    def test_sinks(self):
        """Smoke-test the four ``process_*`` handlers with a sink registered.

        Nothing is asserted explicitly; any exception raised by a handler
        fails the test.
        """
        current = datetime.fromtimestamp(time.time())
        now = datetime(*current.timetuple()[0:6])
        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            conf,
            StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(conf.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(conf),
        )
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        def requeue():
            # Put the URI back on the heap ahead of the next handler call.
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

        requeue()
        frontier.process_successful_crawl(curi)

        requeue()
        frontier.process_not_found(curi)

        requeue()
        frontier.process_redirect(curi)

        requeue()
        frontier.process_server_error(curi)
    def test_that_content_type_restriction_works(self):
        """``_restrict_content_type`` is truthy for text/html, falsy otherwise."""
        extractor = DefaultHtmlLinkExtractor(Settings())

        curi = CrawlUri()
        curi.rep_header = {}

        # An HTML response passes the restriction ...
        curi.rep_header["Content-Type"] = "text/html"
        accepted = extractor._restrict_content_type(curi)
        self.assertTrue(accepted)

        # ... while a made-up media type does not.
        curi.rep_header["Content-Type"] = "pille/palle"
        rejected = extractor._restrict_content_type(curi)
        self.assertFalse(rejected)
    def test_that_content_type_restriction_works(self):
        """Check the content-type filter of the HTML link extractor.

        ``text/html`` must pass ``_restrict_content_type``; the bogus type
        ``pille/palle`` must not.
        """
        link_extractor = DefaultHtmlLinkExtractor(Settings())

        curi = CrawlUri()
        curi.rep_header = {}
        curi.rep_header["Content-Type"] = "text/html"
        self.assertTrue(link_extractor._restrict_content_type(curi))

        # Same URI object, now with a content type that is not HTML.
        curi.rep_header["Content-Type"] = "pille/palle"
        self.assertFalse(link_extractor._restrict_content_type(curi))
示例#7
0
    def test_only_on_redirect(self):
        """A ``Location`` header on a 200 response must not yield extracted URLs."""
        settings = Settings()

        request = CrawlUri("http://localhost")
        request.status_code = 200
        request.rep_header = {"Location": "http://localhost/index.html"}
        request.optional_vars = {}

        extractor = HttpExtractor(settings)
        processed = extractor(request)

        # Status 200 is not a redirect, so the key must be absent.
        has_extracted_urls = CURI_EXTRACTED_URLS in processed.optional_vars
        self.assertFalse(has_extracted_urls)
示例#8
0
    def test_that_updating_heap_works(self):
        """Check which URIs ``get_next`` hands out after some were updated.

        Nineteen URIs are added. The even-numbered ones get a lowered
        ``next_date`` via ``update_uri`` and are tracked in ``q2``; the
        odd-numbered ones are tracked in ``q1``. Nine calls to ``get_next``
        must return exactly the ``q2`` URIs, after which the frontier must
        raise ``Empty``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        q1 = []
        q2 = []

        # Whole-second timestamp two days in the past.
        now = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6]) - timedelta(
                days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = {
                "Etag": "123%s" % i,
                "Date": serialize_date_time(now)
            }

            frontier.add_uri(curi)

            if i % 2 == 0:
                (url, etag, mod_date, next_date,
                 prio) = frontier._uri_from_curi(curi)
                # Move the next crawl date back (unit of the offset is not
                # visible from this test).
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri(
                    (url, etag, mod_date, next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        # Nothing has been moved onto the heap before the first get_next().
        self.assertRaises(Empty, frontier._heap.get_nowait)

        for _ in range(1, 10):
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            if candidate_uri.url in q1:
                q1.remove(candidate_uri.url)
            elif candidate_uri.url in q2:
                q2.remove(candidate_uri.url)
            else:
                # Such a URL would also break the length checks below;
                # failing here names the offending URL directly.
                self.fail("unexpected or repeated URL from get_next(): %s"
                          % candidate_uri.url)

        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
示例#9
0
    def test_relative_links(self):
        """A relative ``Location`` on a 303 is reported as an absolute URL.

        The extractor must store ``http://localhost/index.html`` under
        ``CURI_EXTRACTED_URLS`` for the request URL ``http://localhost`` and
        the header value ``/index.html``.
        """
        s = Settings()

        curi = CrawlUri("http://localhost")
        curi.status_code = 303
        curi.rep_header = {"Location": "/index.html"}
        curi.optional_vars = dict()

        xtor = HttpExtractor(s)
        curi = xtor(curi)

        self.assertIn(CURI_EXTRACTED_URLS, curi.optional_vars)
        # assertEqual instead of the deprecated assertEquals alias, which
        # no longer exists on Python 3.12+.
        self.assertEqual("http://localhost/index.html",
                         curi.optional_vars[CURI_EXTRACTED_URLS])
示例#10
0
    def test_that_updating_heap_works(self):
        """Verify that ``get_next`` serves the URIs that were updated.

        URIs 1..19 are added; even-numbered ones are re-submitted through
        ``update_uri`` with a lowered ``next_date`` and collected in ``q2``,
        the rest in ``q1``. The nine URIs returned by ``get_next`` must empty
        ``q2`` and leave ``q1`` untouched, and a further ``get_next`` must
        raise ``Empty``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        q1 = []
        q2 = []

        # Whole-second timestamp two days in the past.
        now = datetime(*datetime.fromtimestamp(
            time.time()).timetuple()[0:6]) - timedelta(days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = {
                "Etag": "123%s" % i,
                "Date": serialize_date_time(now),
            }

            frontier.add_uri(curi)

            if i % 2 == 0:
                (url, etag, mod_date, next_date,
                 prio) = frontier._uri_from_curi(curi)
                # Lower the next crawl date; the unit of the offset is not
                # visible from this test.
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri(
                    (url, etag, mod_date, next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        # The heap is still empty until get_next() is called.
        self.assertRaises(Empty, frontier._heap.get_nowait)

        for _ in range(1, 10):
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            if candidate_uri.url in q1:
                q1.remove(candidate_uri.url)
            elif candidate_uri.url in q2:
                q2.remove(candidate_uri.url)
            else:
                # The length checks below would fail for such a URL too;
                # report it here with the URL in the message.
                self.fail("unexpected or repeated URL from get_next(): %s"
                          % candidate_uri.url)

        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
    def test_missing_encoding_works(self):
        """Links are extracted when ``Content-Type`` carries no charset.

        The three anchors (absolute, host-relative, path-relative) must come
        back as absolute URLs resolved against ``curi.url``.
        """
        html = (
            "<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'>"
        )

        curi = CrawlUri()
        curi.rep_header = {}
        curi.rep_header["Content-Type"] = "text/html"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = html
        curi.optional_vars = {}

        extractor = DefaultHtmlLinkExtractor(Settings())
        curi = extractor(curi)

        # Extracted URLs are stored as one newline-separated string.
        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        expected = [
            "http://www.google.de",
            "http://www.bmg.bund.de/relative.html",
            "http://www.bmg.bund.de/test/evenmorerelative.html",
        ]
        for position, url in enumerate(expected):
            self.assertEqual(url, links[position])
    def test_missing_encoding_works(self):
        """Extract links from HTML whose ``Content-Type`` names no encoding.

        Checks the first three entries of the newline-separated
        ``CURI_EXTRACTED_URLS`` value against the expected absolute URLs.
        """
        fragments = [
            "<a href='http://www.google.de' title='ups'> viel text</a>",
            "<a title='ups i did it again' href ='/relative.html'>und ",
            "noch mehr!</a><a href='evenmorerelative.html'>",
        ]
        body = "".join(fragments)

        curi = CrawlUri()
        curi.rep_header = {}
        curi.rep_header["Content-Type"] = "text/html"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = body
        curi.optional_vars = {}

        link_extractor = DefaultHtmlLinkExtractor(Settings())
        curi = link_extractor(curi)

        extracted = curi.optional_vars[CURI_EXTRACTED_URLS]
        links = extracted.split("\n")

        first_expected = "http://www.google.de"
        second_expected = "http://www.bmg.bund.de/relative.html"
        third_expected = "http://www.bmg.bund.de/test/evenmorerelative.html"
        self.assertEqual(first_expected, links[0])
        self.assertEqual(second_expected, links[1])
        self.assertEqual(third_expected, links[2])
示例#13
0
    def test_that_time_based_politeness_works(self):
        """A successful crawl must push ``_next_possible_crawl`` into the future.

        After ``process_successful_crawl`` the value must exceed both its
        previous value and the current time, and ``get_next`` must raise
        ``Empty``.
        """
        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

        # Whole-second timestamp, shifted two days into the past.
        whole_second = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6])
        now = whole_second - timedelta(days=2)

        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.req_time = 0.5

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

        previous_limit = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)

        self.assertTrue(frontier._next_possible_crawl > previous_limit)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
    def test_link_extraction_works(self):
        """Extract and absolutize links from a UTF-8 HTML snippet.

        The fourth anchor's href is an entity-encoded ``mailto:`` address;
        only the first three extracted links are checked.
        """
        html = (
            "<a href='http://www.google.de' title='ups'> viel text</a>"
            "<a title='ups i did it again' href ='/relative.html'>und "
            "noch mehr!</a><a href='evenmorerelative.html'/>"
            "<a href='&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#109;&#117;&#115;&#116;&#101;&#114;&#64;&#98;&#102;&#97;&#114;&#109;&#46;&#100;&#101;'/>"
        )

        curi = CrawlUri()
        curi.rep_header = {}
        curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = html
        curi.optional_vars = {}

        extractor = DefaultHtmlLinkExtractor(Settings())
        curi = extractor(curi)

        # Extracted URLs are stored as one newline-separated string.
        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        expected = [
            "http://www.google.de",
            "http://www.bmg.bund.de/relative.html",
            "http://www.bmg.bund.de/test/evenmorerelative.html",
        ]
        for position, url in enumerate(expected):
            self.assertEqual(url, links[position])
示例#15
0
    def test_that_time_based_politeness_works(self):
        """Check that a processed crawl delays the next possible crawl.

        ``_next_possible_crawl`` is sampled before and after
        ``process_successful_crawl``; afterwards it must be larger than the
        sample and larger than ``time.time()``, and ``get_next`` must raise
        ``Empty``.
        """
        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(conf, StreamHandler(sys.stdout))

        # Drop the sub-second part, then go back two days.
        current = datetime.fromtimestamp(time.time())
        now = datetime(*current.timetuple()[0:6]) - timedelta(days=2)

        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.req_time = 0.5

        heap_entry = frontier._uri_from_curi(curi)
        frontier._add_to_heap(heap_entry, 0)

        before = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)
        self.assertTrue(frontier._next_possible_crawl > before)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
    def test_link_extraction_works(self):
        """Run the HTML link extractor on a small UTF-8 document.

        The document holds an absolute link, a host-relative link, a
        path-relative link and an entity-encoded ``mailto:`` link. Only the
        first three extracted URLs are asserted.
        """
        fragments = [
            "<a href='http://www.google.de' title='ups'> viel text</a>",
            "<a title='ups i did it again' href ='/relative.html'>und ",
            "noch mehr!</a><a href='evenmorerelative.html'/>",
            "<a href='&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#109;&#117;&#115;&#116;&#101;&#114;&#64;&#98;&#102;&#97;&#114;&#109;&#46;&#100;&#101;'/>",
        ]
        body = "".join(fragments)

        curi = CrawlUri()
        curi.rep_header = {}
        curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = body
        curi.optional_vars = {}

        link_extractor = DefaultHtmlLinkExtractor(Settings())
        curi = link_extractor(curi)

        extracted = curi.optional_vars[CURI_EXTRACTED_URLS]
        links = extracted.split("\n")

        first_expected = "http://www.google.de"
        second_expected = "http://www.bmg.bund.de/relative.html"
        third_expected = "http://www.bmg.bund.de/test/evenmorerelative.html"
        self.assertEqual(first_expected, links[0])
        self.assertEqual(second_expected, links[1])
        self.assertEqual(third_expected, links[2])
示例#17
0
    def test_adding_uri_works(self):
        """Add one URI and check what the queue head reports for it.

        Each tuple from ``queue_head()`` must carry the URL, the Etag and a
        modification timestamp equal to the ``Date`` header that was set.
        """
        # Whole-second timestamp so it compares equal after the round trip
        # through serialize_date_time() and datetime.fromtimestamp().
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        # NOTE(review): if queue_head() yields nothing, the loop body never
        # runs and the test passes without checking anything.
        for uri in frontier._front_end_queues.queue_head():
            # Full unpacking also checks that the tuple has five fields.
            (url, etag, mod_date, queue, next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri
示例#18
0
    def test_adding_uri_works(self):
        """Check that an added URI shows up at the head of the queue.

        The URL, Etag and modification timestamp of every tuple returned by
        ``queue_head()`` are compared with the values set on the ``CrawlUri``.
        """
        # Truncated to whole seconds; compared with the stored value below.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        # NOTE(review): an empty queue_head() would make this loop a no-op,
        # so the test would pass without verifying anything.
        for uri in frontier._front_end_queues.queue_head():
            # Unpack all five fields even though only three are asserted;
            # this keeps the implicit check on the tuple length.
            (url, etag, mod_date, queue, next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri