Example #1
def single_node_chain(url1, url2):
    r = Request(url=url1)
    re = Response(url=url2, request=r)
    re.meta[b'fingerprint'] = sha1(url2)
    re.meta[b'redirect_urls'] = [url1]
    re.meta[b'redirect_fingerprints'] = [sha1(url1)]
    return re
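The imports are implicit here. A minimal sketch of the context this helper appears to assume: Frontera's Request/Response models and its sha1 fingerprint helper (the import paths follow Frontera's layout and should be verified against the version in use).

# Assumed imports for single_node_chain (paths follow Frontera's layout;
# verify against the version in use).
from frontera.core.models import Request, Response
from frontera.utils.fingerprint import sha1

# Build a Response whose meta records a one-hop redirect chain url1 -> url2.
resp = single_node_chain('http://www.example.com/', 'http://example.com/')
assert resp.meta[b'redirect_urls'] == ['http://www.example.com/']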
Example #2
def single_node_chain(url1, url2):
    r = Request(url=url1)
    re = Response(url=url2, request=r)
    re.meta[b'fingerprint'] = sha1(url2)
    re.meta[b'redirect_urls'] = [url1]
    re.meta[b'redirect_fingerprints'] = [sha1(url1)]
    return re
Example #3
def test_basic():
    cs = Basic()
    r = Request(url="http://www.scrapinghub.com/")

    re = Response(url="http://scrapinghub.com/", request=r)
    re.meta["fingerprint"] = "6d8afb0c246caa28a2c1bdaaac19c70c24a2d22e"
    re.meta["redirect_urls"] = ["http://www.scrapinghub.com/"]
    re.meta["redirect_fingerprints"] = ["6cd0a1e069d5a1666a6ec290a4b33f5f325c2e66"]
    cs.page_crawled(re, [])
    assert re.url == "http://www.scrapinghub.com/"
Example #4
def test_basic():
    cs = Basic()
    r = Request(url="http://www.scrapinghub.com/")

    re = Response(url="http://scrapinghub.com/", request=r)
    re.meta['fingerprint'] = "6d8afb0c246caa28a2c1bdaaac19c70c24a2d22e"
    re.meta['redirect_urls'] = ['http://www.scrapinghub.com/']
    re.meta['redirect_fingerprints'] = [
        "6cd0a1e069d5a1666a6ec290a4b33f5f325c2e66"
    ]
    cs.page_crawled(re, [])
    assert re.url == "http://www.scrapinghub.com/"
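Both test_basic variants exercise the Basic canonical solver: after page_crawled, the response URL is rewritten to the first URL in the recorded redirect chain. A sketch of the assumed imports (the export of Basic from this package is an assumption based on Frontera's layout):

# Assumed imports for the test_basic variants (paths per Frontera's layout;
# verify against the version in use).
from frontera.contrib.canonicalsolvers import Basic
from frontera.core.models import Request, Response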
Example #5
    def test_blocking_middleware(self):
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks'
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        fm = FrontierManager.from_settings(settings)
        fm.add_seeds([r1, r2, r3])
        response = Response(r1.url, request=r1)
        fm.page_crawled(response)
        fm.links_extracted(r1, links=[r2])
        fm.request_error(r3, 'error')

        # The seeds, responses, links and errors have not reached the backend.
        assert [len(lst) for lst in fm.backend.lists] == [0] * 4
        # The 3 seeds reach the first three middlewares.
        assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
        # The error, response and link also reach the first three middlewares.
        assert [[len(lst) for lst in fm.middlewares[i].lists[1:]]
                for i in range(3)] == [[1] * 3] * 3
        # Nothing gets past the blocking middleware: the bottom two middlewares
        # and the canonical solver see no objects.
        assert [[len(lst) for lst in fm.middlewares[i].lists]
                for i in range(3, 5)] == [[0] * 4] * 2
        assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
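For context, the gist of a "blocking" middleware such as tests.mocks.components.FakeMiddlewareBlocking: in Frontera's middleware chain, a hook that returns None is assumed to filter the object out, so nothing downstream sees seeds, responses, links or errors. A minimal sketch under that assumption (the class name and its record-keeping lists are hypothetical, not Frontera API):

# Hypothetical sketch of a blocking middleware (not Frontera API; it only
# mirrors what the test above exercises). Returning None from a hook is
# assumed to stop the object from propagating down the chain.
from frontera.core.components import Middleware

class BlockingMiddleware(Middleware):
    def __init__(self):
        self.seeds, self.responses, self.links, self.errors = [], [], [], []

    @classmethod
    def from_manager(cls, manager):
        return cls()

    def frontier_start(self):
        pass

    def frontier_stop(self):
        pass

    def add_seeds(self, seeds):
        self.seeds.extend(seeds)
        return None  # swallow: later middlewares and the backend see nothing

    def page_crawled(self, response):
        self.responses.append(response)
        return None

    def links_extracted(self, request, links):
        self.links.extend(links)
        return None

    def request_error(self, page, error):
        self.errors.append((page, error))
        return None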
Example #6
 def test_page_crawled(self):
     mbb = self.mbb_setup()
     resp = Response(r1.url, body='body', request=r1)
     mbb.page_crawled(resp)
     page = mbb._decoder.decode(mbb.spider_log_producer.messages[0])[1]
     self.assertEqual((page.request.url, page.body),
                      (resp.request.url, 'body'))
Example #7
 def test_page_crawled(self):
     dbw = self.dbw_setup()
     resp = Response(r1.url, request=r1)
     msg = dbw._encoder.encode_page_crawled(resp)
     dbw.spider_log_consumer.put_messages([msg])
     dbw.consume_incoming()
     assert set([r.url for r in dbw._backend.responses]) == set([r1.url])
Example #8
    def test_blocking_middleware(self):
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks'
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        fm = LocalFrontierManager.from_settings(settings)
        SEEDS_FILE.seek(0)
        fm.add_seeds(SEEDS_FILE)
        response = Response(r1.url, request=r1)
        fm.page_crawled(response)
        fm.links_extracted(r1, links=[r2])
        fm.request_error(r3, 'error')

        # The seeds, responses, links and errors have not reached the backend.
        assert [len(lst) for lst in fm.backend.lists] == [0] * 4
        # The 3 seed requests reach the three fake middlewares up to and
        # including the blocking one (indices 2-4; 0 and 1 are the domain
        # and fingerprint middlewares).
        assert [len(fm.middlewares[i].requests)
                for i in range(2, 5)] == [3] * 3
        # The error, response and link also reach those three middlewares.
        assert [[len(lst) for lst in fm.middlewares[i].lists[1:]]
                for i in range(2, 5)] == [[1] * 3] * 3
        # Nothing gets past the blocking middleware: the last two middlewares
        # and the canonical solver see no objects.
        assert [[len(lst) for lst in fm.middlewares[i].lists]
                for i in range(5, 7)] == [[0] * 4] * 2
        assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
Example #9
 def test_page_crawled(self):
     fm = self.setup_frontier_manager()
     response = Response(r1.url, request=r1)
     fm.page_crawled(response)
     assert fm.backend.responses.pop() == response
     assert [mw.responses.pop() for mw in fm.middlewares] == [response] * 4
     assert fm.canonicalsolver.responses.pop() == response
     assert response.meta[b'test_response'] == 'test'
Example #10
 def test_page_crawled(self):
     dbw = self.dbw_setup()
     resp = Response(r1.url, request=r1)
     msg = dbw._encoder.encode_page_crawled(resp)
     incoming_consumer = dbw.slot.components[IncomingConsumer]
     incoming_consumer.spider_log_consumer.put_messages([msg])
     incoming_consumer.run()
     assert set([r.url for r in incoming_consumer.backend.responses
                 ]) == set([r1.url])
Example #11
 def test_links_extracted(self):
     fm = self.setup_frontier_manager()
     response = Response(r1.url, request=r1)
     fm.links_extracted(r1, links=[r2, r3])
     assert set([link for link in fm.backend.links]) == set([r2, r3])
     assert set([link
                 for link in fm.canonicalsolver.links]) == set([r2, r3])
     assert [set([link for link in mw.links])
             for mw in fm.middlewares] == [set([r2, r3])] * 4
     assert [link.meta[b'test_links'] for link in [r2, r3]] == ['test'] * 2
     assert [
         link.meta[b'test_links_canonical_solver'] for link in [r2, r3]
     ] == ['test'] * 2
Example #12
 def test_metadata(self):
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \
         set([r1.url, r2.url, r3.url])
     self.delete_rows(table, [b'10', b'11', b'12'])
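The Connection above is presumably happybase.Connection, i.e. the test expects a reachable HBase Thrift server (the hbase-docker hostname suggests a Docker container). A sketch of that assumption:

# Assumed setup for the HBase test (a sketch): happybase connecting to an
# HBase Thrift server, e.g. one running in a Docker container.
from happybase import Connection

connection = Connection(host='hbase-docker', port=9090)  # 9090 = Thrift API default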
Example #13
 def test_page_crawled(self):
     sw = self.sw
     r1.meta[b'jid'] = 1
     resp = Response(r1.url, request=r1)
     msg = sw._encoder.encode_page_crawled(resp)
     sw.consumer.put_messages([msg])
     sw.work()
     # response should be skipped if its jid doesn't match the strategy worker's
     assert sw.scoring_log_producer.messages == []
     sw.workflow.job_id = 1
     sw.consumer.put_messages([msg])
     sw.work()
     r1c = r1.copy()
     sw.workflow.states_context.states.set_states(r1c)
     assert r1c.meta[b'state'] == States.CRAWLED
Example #14
def test_codec(encoder, decoder, send_body, invalid_value):
    def check_request(req1, req2):
        assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) and \
               _compare_dicts(req1.headers, req2.headers) and req1.method == req2.method

    enc = encoder(Request, send_body=send_body)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru",
                  method=b'GET',
                  meta={
                      b'test': b'shmest',
                      b'scrapy_meta': {
                          'rule': 0,
                          'key': 'value'
                      }
                  },
                  headers={b'reqhdr': b'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(
            Response(url="http://www.yandex.ru",
                     body=b'SOME CONTENT',
                     headers={b'hdr': b'value'},
                     request=req)),
        enc.encode_links_extracted(req, [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score(req, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req),
        invalid_value,
    ]

    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].meta == req.meta
    if send_body:
        assert o[1].body == b'SOME CONTENT'
    else:
        assert o[1].body is None

    o = dec.decode(next(it))
    print(o)
    assert o[0] == 'links_extracted'
    assert type(o[1]) == Request
    assert o[1].url == req.url and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, o_req2, score, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers
    assert score == 0.51
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    o = dec.decode_request(next(it))
    check_request(o, req)

    with pytest.raises(TypeError):
        dec.decode(next(it))
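test_codec receives the codec classes and an invalid payload as parameters. A sketch of one way to parametrize it (the codec import paths follow Frontera's layout; the invalid payloads are hypothetical placeholders for bytes each decoder rejects):

# Hypothetical parametrization for the test above (import paths per
# Frontera's layout; the invalid payloads are placeholders, choose bytes
# each decoder actually fails to decode).
import pytest
from frontera.contrib.backends.remote.codecs.json import (
    Encoder as JsonEncoder, Decoder as JsonDecoder)
from frontera.contrib.backends.remote.codecs.msgpack import (
    Encoder as MsgPackEncoder, Decoder as MsgPackDecoder)

@pytest.mark.parametrize('send_body', [True, False])
@pytest.mark.parametrize('encoder,decoder,invalid_value', [
    (MsgPackEncoder, MsgPackDecoder, b'\x00'),  # placeholder invalid msgpack message
    (JsonEncoder, JsonDecoder, b'{}'),          # placeholder invalid JSON message
])
def test_codec(encoder, decoder, send_body, invalid_value):
    ...  # body as in Example #14 above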
Example #15
def test_codec(encoder, decoder):
    def check_request(req1, req2):
        assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers \
            and req1.method == req2.method

    enc = encoder(Request, send_body=True)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru",
                  method=b'GET',
                  meta={b"test": b"shmest"},
                  headers={b'reqhdr': b'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(
            Response(url="http://www.yandex.ru",
                     body=b'SOME CONTENT',
                     headers={b'hdr': b'value'},
                     request=req)),
        enc.encode_links_extracted(req, [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score(req, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req)
    ]

    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].body == b'SOME CONTENT' and o[1].meta == req.meta

    o = dec.decode(next(it))
    print(o)
    assert o[0] == 'links_extracted'
    assert type(o[1]) == Request
    assert o[1].url == req.url and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, o_req2, score, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers
    assert score == 0.51
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    o = dec.decode_request(next(it))
    check_request(o, req)
Example #16
def test_codec(encoder, decoder):
    def check_request(req1, req2):
        assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers

    enc = encoder(Request, send_body=True)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru", meta={"test": "shmest"}, headers={'reqhdr': 'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(Response(url="http://www.yandex.ru", body='SOME CONTENT', headers={'hdr': 'value'},
                                         request=req), [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score("1be68ff556fd0bbe5802d1a100850da29f7f15b1", 0.51, "http://yandex.ru", True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req)
    ]

    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].body == 'SOME CONTENT' and o[1].meta == req.meta

    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, fprint, score, url, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert fprint == "1be68ff556fd0bbe5802d1a100850da29f7f15b1"
    assert score == 0.51
    assert url == "http://yandex.ru"
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    o = dec.decode_request(next(it))
    check_request(o, req)