def single_node_chain(url1, url2):
    """Build a Response for *url2* recording a single redirect hop from *url1*.

    The redirect history (urls and their fingerprints) is stored in the
    response meta, mirroring what the redirect handling would produce.
    """
    source_request = Request(url=url1)
    final_response = Response(url=url2, request=source_request)
    final_response.meta[b'fingerprint'] = sha1(url2)
    final_response.meta[b'redirect_urls'] = [url1]
    final_response.meta[b'redirect_fingerprints'] = [sha1(url1)]
    return final_response
def test_basic():
    """After page_crawled, the Basic canonical solver must rewrite the
    response url to the first url of the redirect chain."""
    solver = Basic()
    original_url = "http://www.scrapinghub.com/"
    request = Request(url=original_url)
    response = Response(url="http://scrapinghub.com/", request=request)
    response.meta["fingerprint"] = "6d8afb0c246caa28a2c1bdaaac19c70c24a2d22e"
    response.meta["redirect_urls"] = [original_url]
    response.meta["redirect_fingerprints"] = ["6cd0a1e069d5a1666a6ec290a4b33f5f325c2e66"]
    solver.page_crawled(response, [])
    assert response.url == original_url
def test_basic():
    """The Basic canonical solver must report the pre-redirect url on the
    crawled response."""
    cs = Basic()
    redirected_from = "http://www.scrapinghub.com/"
    req = Request(url=redirected_from)
    resp = Response(url="http://scrapinghub.com/", request=req)
    resp.meta['fingerprint'] = "6d8afb0c246caa28a2c1bdaaac19c70c24a2d22e"
    resp.meta['redirect_urls'] = [redirected_from]
    resp.meta['redirect_fingerprints'] = ["6cd0a1e069d5a1666a6ec290a4b33f5f325c2e66"]
    cs.page_crawled(resp, [])
    assert resp.url == redirected_from
def test_blocking_middleware(self):
    """The blocking middleware must stop event propagation: seeds, responses,
    links and errors reach only the middlewares above it, never the ones
    below it, the canonical solver or the backend.

    Fix: the comprehension variable was named ``list``, shadowing the
    builtin; renamed to ``lst``.
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                            'tests.mocks.components.FakeMiddlewareModifySeeds',
                            'tests.mocks.components.FakeMiddlewareBlocking',
                            'tests.mocks.components.FakeMiddlewareModifyResponse',
                            'tests.mocks.components.FakeMiddlewareModifyLinks']
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    fm = FrontierManager.from_settings(settings)
    fm.add_seeds([r1, r2, r3])
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')

    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three middlewares.
    assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
    # the error, response and link reached the first three middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists] for i in range(3, 5)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def test_page_crawled(self):
    """The message-bus backend must publish a page_crawled event carrying
    the response url and body to the spider log producer."""
    backend = self.mbb_setup()
    response = Response(r1.url, body='body', request=r1)
    backend.page_crawled(response)
    first_message = backend.spider_log_producer.messages[0]
    page = backend._decoder.decode(first_message)[1]
    self.assertEqual((page.request.url, page.body),
                     (response.request.url, 'body'))
def test_page_crawled(self):
    """A page_crawled message consumed from the spider log must be delivered
    to the backend as a response with the original url."""
    worker = self.dbw_setup()
    response = Response(r1.url, request=r1)
    encoded = worker._encoder.encode_page_crawled(response)
    worker.spider_log_consumer.put_messages([encoded])
    worker.consume_incoming()
    assert {seen.url for seen in worker._backend.responses} == {r1.url}
def test_blocking_middleware(self):
    """With the domain/fingerprint middlewares stacked on top, the blocking
    middleware must still stop propagation: events reach only the middlewares
    above it, never the ones below, the canonical solver or the backend.

    Fix: the comprehension variable was named ``list``, shadowing the
    builtin; renamed to ``lst``.
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware',
                            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
                            'tests.mocks.components.FakeMiddleware',
                            'tests.mocks.components.FakeMiddlewareModifySeeds',
                            'tests.mocks.components.FakeMiddlewareBlocking',
                            'tests.mocks.components.FakeMiddlewareModifyResponse',
                            'tests.mocks.components.FakeMiddlewareModifyLinks']
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    fm = LocalFrontierManager.from_settings(settings)
    SEEDS_FILE.seek(0)
    fm.add_seeds(SEEDS_FILE)
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')

    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three middlewares.
    assert [len(fm.middlewares[i].requests) for i in range(2, 5)] == [3] * 3
    # the error, response and link reached the first three middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]] for i in range(2, 5)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists] for i in range(5, 7)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def test_page_crawled(self):
    """page_crawled must route the response through every middleware and the
    canonical solver before handing it to the backend, each tagging meta."""
    manager = self.setup_frontier_manager()
    response = Response(r1.url, request=r1)
    manager.page_crawled(response)
    assert manager.backend.responses.pop() == response
    passed_through = [mw.responses.pop() for mw in manager.middlewares]
    assert passed_through == [response] * 4
    assert manager.canonicalsolver.responses.pop() == response
    assert response.meta[b'test_response'] == 'test'
def test_page_crawled(self):
    """The IncomingConsumer component must decode a page_crawled message from
    the spider log and deliver the response to its backend."""
    worker = self.dbw_setup()
    response = Response(r1.url, request=r1)
    message = worker._encoder.encode_page_crawled(response)
    consumer = worker.slot.components[IncomingConsumer]
    consumer.spider_log_consumer.put_messages([message])
    consumer.run()
    assert {seen.url for seen in consumer.backend.responses} == {r1.url}
def test_links_extracted(self):
    """links_extracted must deliver the links to the backend, the canonical
    solver and every middleware, each of which tags the links via meta.

    Fix: removed the redundant ``set([x for x in xs])`` wrappers —
    ``set(xs)`` is equivalent and clearer.
    """
    fm = self.setup_frontier_manager()
    response = Response(r1.url, request=r1)
    fm.links_extracted(r1, links=[r2, r3])
    assert set(fm.backend.links) == set([r2, r3])
    assert set(fm.canonicalsolver.links) == set([r2, r3])
    assert [set(mw.links) for mw in fm.middlewares] == [set([r2, r3])] * 4
    # every middleware and the canonical solver left their marks in meta.
    assert [link.meta[b'test_links'] for link in [r2, r3]] == ['test'] * 2
    assert [link.meta[b'test_links_canonical_solver'] for link in [r2, r3]] == ['test'] * 2
def test_metadata(self):
    """HBaseMetadata must persist the urls of seeds, crawled pages and
    extracted links into the ``m:url`` column of the metadata table."""
    connection = Connection(host='hbase-docker', port=9090)
    metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
    metadata.add_seeds([r1, r2, r3])
    response = Response('https://www.example.com', request=r1)
    metadata.page_crawled(response)
    metadata.links_extracted(response.request, [r2, r3])
    metadata.request_error(r4, 'error')
    metadata.frontier_stop()
    table = connection.table('metadata')
    stored = set(to_native_str(data[b'm:url'], 'utf-8')
                 for _, data in table.scan())
    assert stored == set([r1.url, r2.url, r3.url])
    self.delete_rows(table, [b'10', b'11', b'12'])
def test_page_crawled(self):
    """A page_crawled message is dropped while its jid differs from the
    strategy worker's job id; once the ids match, the page state must
    become CRAWLED."""
    sw = self.sw
    r1.meta[b'jid'] = 1
    response = Response(r1.url, request=r1)
    message = sw._encoder.encode_page_crawled(response)
    sw.consumer.put_messages([message])
    sw.work()
    # the jid does not match the worker's job id, so nothing was scored
    assert sw.scoring_log_producer.messages == []
    sw.workflow.job_id = 1
    sw.consumer.put_messages([message])
    sw.work()
    replayed = r1.copy()
    sw.workflow.states_context.states.set_states(replayed)
    assert replayed.meta[b'state'] == States.CRAWLED
def test_codec(encoder, decoder, send_body, invalid_value):
    """Round-trip every codec message type and verify the decoded payloads.

    When *send_body* is falsy the encoder must drop the response body;
    *invalid_value* must make the decoder raise ``TypeError``.

    Fix: the body checks after 'page_crawled' were bare comparison
    expressions (``o[1].body == b'SOME CONTENT'`` / ``o[1].body is None``)
    that were evaluated and discarded — they are now real assertions.
    """
    def check_request(req1, req2):
        assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) == True and \
            _compare_dicts(req1.headers, req2.headers) == True and req1.method == req2.method

    enc = encoder(Request, send_body=send_body)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru", method=b'GET',
                  meta={b'test': b'shmest', b'scrapy_meta': {'rule': 0, 'key': 'value'}},
                  headers={b'reqhdr': b'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(Response(url="http://www.yandex.ru",
                                         body=b'SOME CONTENT',
                                         headers={b'hdr': b'value'},
                                         request=req)),
        enc.encode_links_extracted(req, [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score(req, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req),
        invalid_value,
    ]
    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].meta == req.meta
    # BUG FIX: previously these were bare expressions and never asserted.
    if send_body:
        assert o[1].body == b'SOME CONTENT'
    else:
        assert o[1].body is None

    o = dec.decode(next(it))
    print(o)
    assert o[0] == 'links_extracted'
    assert type(o[1]) == Request
    assert o[1].url == req.url and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, o_req2, score, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers
    assert score == 0.51
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    o = dec.decode_request(next(it))
    check_request(o, req)

    with pytest.raises(TypeError):
        dec.decode(next(it))
def test_codec(encoder, decoder):
    """Encode one message of every kind, then decode them in order and check
    that urls, meta, headers, body and scalar payloads survive the trip."""

    def same_request(a, b):
        assert a.url == b.url and a.meta == b.meta and a.headers == b.headers \
            and a.method == b.method

    enc = encoder(Request, send_body=True)
    dec = decoder(Request, Response)
    seed = Request(url="http://www.yandex.ru", method=b'GET',
                   meta={b"test": b"shmest"}, headers={b'reqhdr': b'value'})
    link = Request(url="http://www.yandex.ru/search")
    stream = iter([
        enc.encode_add_seeds([seed]),
        enc.encode_page_crawled(Response(url="http://www.yandex.ru",
                                         body=b'SOME CONTENT',
                                         headers={b'hdr': b'value'},
                                         request=seed)),
        enc.encode_links_extracted(seed, [link]),
        enc.encode_request_error(seed, "Host not found"),
        enc.encode_update_score(seed, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(seed),
    ])

    decoded = dec.decode(next(stream))
    assert decoded[0] == 'add_seeds'
    assert type(decoded[1]) == list
    restored = decoded[1][0]
    same_request(restored, seed)
    assert type(restored) == Request

    decoded = dec.decode(next(stream))
    assert decoded[0] == 'page_crawled'
    assert type(decoded[1]) == Response
    assert decoded[1].url == seed.url and decoded[1].body == b'SOME CONTENT' \
        and decoded[1].meta == seed.meta

    decoded = dec.decode(next(stream))
    print(decoded)
    assert decoded[0] == 'links_extracted'
    assert type(decoded[1]) == Request
    assert decoded[1].url == seed.url and decoded[1].meta == seed.meta
    assert type(decoded[2]) == list
    restored = decoded[2][0]
    assert type(restored) == Request
    assert restored.url == link.url

    kind, err_req, err_msg = dec.decode(next(stream))
    assert kind == 'request_error'
    same_request(err_req, seed)
    assert err_msg == "Host not found"

    kind, scored_req, score, schedule = dec.decode(next(stream))
    assert kind == 'update_score'
    assert scored_req.url == seed.url and scored_req.meta == seed.meta \
        and scored_req.headers == seed.headers
    assert score == 0.51
    assert schedule is True

    kind, job_id = dec.decode(next(stream))
    assert kind == 'new_job_id'
    assert job_id == 1

    kind, partition_id, offset = dec.decode(next(stream))
    assert kind == 'offset'
    assert partition_id == 0
    assert offset == 28796

    same_request(dec.decode_request(next(stream)), seed)
def test_codec(encoder, decoder):
    """Round-trip every codec message type and verify the decoded payloads
    (legacy API: page_crawled carries links, update_score takes a fingerprint).

    Fix: replaced Python-2-only ``it.next()`` with the builtin ``next(it)``,
    which works on both Python 2.6+ and Python 3.
    """
    def check_request(req1, req2):
        assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers

    enc = encoder(Request, send_body=True)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru", meta={"test": "shmest"},
                  headers={'reqhdr': 'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(Response(url="http://www.yandex.ru",
                                         body='SOME CONTENT',
                                         headers={'hdr': 'value'},
                                         request=req), [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score("1be68ff556fd0bbe5802d1a100850da29f7f15b1",
                                0.51, "http://yandex.ru", True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req),
    ]
    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].body == 'SOME CONTENT' and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, fprint, score, url, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert fprint == "1be68ff556fd0bbe5802d1a100850da29f7f15b1"
    assert score == 0.51
    assert url == "http://yandex.ru"
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    o = dec.decode_request(next(it))
    check_request(o, req)