def test_policy_constructor_blank_name():
    ''' A policy document with a blank name should fail validation. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': '',  # Blank name: the invalid field under test.
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {
            'enabled': True
        },
        'limits': {
            'max_cost': 10,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234'},
        ],
        'robots_txt': {
            'usage': 'IGNORE',
        },
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}]
    }
    with pytest.raises(PolicyValidationError):
        # No need to bind the result: the constructor is expected to raise
        # before returning (previously assigned to an unused `policy` local).
        Policy(doc, version='1.0.0', seeds=[])
def make_policy(proxy=None):
    ''' Build a sample policy fixture, optionally with proxy rules. '''
    timestamp = datetime(2018, 12, 31, 13, 47, 0)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': timestamp,
        'updated_at': timestamp,
        'authentication': {'enabled': False},
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        # Fall back to no proxy rules when the caller supplies none.
        'proxy_rules': proxy or [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(policy_doc, '1.0.0', ['https://seeds.example'])
def make_policy():
    ''' Construct a sample policy fixture for frontier tests. '''
    fixed_time = datetime(2018, 12, 31, 13, 47, 0)
    doc = {
        'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        'name': 'Test',
        'created_at': fixed_time,
        'updated_at': fixed_time,
        'authentication': {'enabled': True},
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': ['b']},
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(doc, '1.0.0', ['https://frontier.example'])
def test_convert_policy_pb_to_doc_captcha():
    ''' A raw 16-byte captcha solver ID in the protobuf should convert to a
    UUID string in the policy document. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    pb = starbelly.starbelly_pb2.Policy()
    pb.policy_id = \
        b'\x01\xb6\x0e\xeb*\xc9OA\x9b\x0cG\xdc\xbc\xf67\xf7'
    pb.name = 'Test'
    pb.created_at = created.isoformat()
    pb.updated_at = updated.isoformat()
    pb.captcha_solver_id = \
        b'\xe2\x72\x23\xd3\x85\xef\x4e\x89\x8f\xc8\xdb\xcf\x8d\xf0\xce\x97'
    converted = Policy.convert_pb_to_doc(pb)
    assert converted['captcha_solver_id'] == \
        'e27223d3-85ef-4e89-8fc8-dbcf8df0ce97'
def make_policy(usage, user_agent):
    ''' Build a minimal policy fixture with the given robots.txt usage mode
    and user agent name. '''
    timestamp = datetime(2018, 12, 31, 13, 47, 0)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': timestamp,
        'updated_at': timestamp,
        'authentication': {'enabled': False},
        'limits': {'max_cost': 10},
        'mime_type_rules': [{'save': True}],
        'proxy_rules': [],
        'robots_txt': {'usage': usage},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [{'action': 'MULTIPLY', 'amount': 0}],
        'user_agents': [{'name': user_agent}],
    }
    return Policy(policy_doc, '1.0.0', ['https://seeds.example'])
def test_policy_constructor_captcha():
    ''' A policy document with a captcha_solver section should yield a
    CaptchaSolver instance on the constructed policy. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'authentication': {'enabled': True},
        'captcha_solver': {
            'id': b'captcha1',
            'name': 'CAPTCHA Solver 1',
            'service_url': 'https://solver.example',
            'api_key': 'test-key',
            'require_phrase': False,
            'case_sensitive': False,
            'characters': 'abcdefg',
            'require_math': False,
            'min_length': 6,
            'max_length': 6,
        },
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    policy = Policy(doc, version='1.0.0', seeds=[])
    assert isinstance(policy.captcha_solver, CaptchaSolver)
def test_policy_constructor():
    ''' Constructing a policy from a valid document should instantiate each
    sub-policy object, with no captcha solver when none is given. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    policy = Policy(doc, version='1.0.0', seeds=[])
    # No captcha_solver key in the document, so the attribute is absent:
    assert policy.captcha_solver is None
    # Each section of the document becomes a typed sub-policy:
    expected_types = [
        ('authentication', PolicyAuthentication),
        ('limits', PolicyLimits),
        ('mime_type_rules', PolicyMimeTypeRules),
        ('proxy_rules', PolicyProxyRules),
        ('robots_txt', PolicyRobotsTxt),
        ('url_normalization', PolicyUrlNormalization),
        ('url_rules', PolicyUrlRules),
        ('user_agents', PolicyUserAgents),
    ]
    for attr, type_ in expected_types:
        assert isinstance(getattr(policy, attr), type_)
def test_convert_policy_doc_to_pb():
    ''' Converting a policy document to its protobuf form should carry over
    every field. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {
            'enabled': True
        },
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234',
             'pattern': r'\.onion',
             'match': 'MATCHES'},
        ],
        'robots_txt': {
            'usage': 'IGNORE',
        },
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}]
    }
    pb = starbelly.starbelly_pb2.Policy()
    Policy.convert_doc_to_pb(doc, pb)
    # The UUID string converts to its raw 16-byte form.
    assert pb.policy_id == b'\x01\xb6\x0e\xeb*\xc9OA\x9b\x0cG\xdc\xbc\xf67\xf7'
    assert pb.name == 'Test'
    assert pb.created_at == created_at.isoformat()
    assert pb.updated_at == updated_at.isoformat()
    # Authentication
    assert pb.authentication.enabled
    # Limits
    assert pb.limits.max_cost == 10
    # MIME type rules
    assert len(pb.mime_type_rules) == 2
    assert pb.mime_type_rules[0].match == MATCH_ENUM.Value('MATCHES')
    # BUG FIX: this previously read `assert pb.mime_type_rules[0].pattern,
    # '^text/'` — the comma made '^text/' the assertion *message*, so only
    # truthiness was checked. Compare for equality instead.
    assert pb.mime_type_rules[0].pattern == '^text/'
    assert pb.mime_type_rules[0].save
    assert not pb.mime_type_rules[1].save
    # Proxy rules
    assert len(pb.proxy_rules) == 1
    assert pb.proxy_rules[0].proxy_url == 'socks5://localhost:1234'
    # Robots.txt
    assert pb.robots_txt.usage == USAGE_ENUM.Value('IGNORE')
    # URL normalization
    assert pb.url_normalization.enabled
    assert pb.url_normalization.strip_parameters == ['PHPSESSID']
    # URL rules
    assert len(pb.url_rules) == 2
    assert pb.url_rules[0].action == ACTION_ENUM.Value('ADD')
    assert pb.url_rules[0].amount == 1
    assert pb.url_rules[0].match == MATCH_ENUM.Value('MATCHES')
    assert pb.url_rules[0].pattern == '^https?://({SEED_DOMAINS})/'
    assert pb.url_rules[1].action == ACTION_ENUM.Value('MULTIPLY')
    assert pb.url_rules[1].amount == 0
    # User agents
    assert len(pb.user_agents) == 1
    assert pb.user_agents[0].name == 'Test User Agent'
def test_convert_policy_pb_to_doc():
    ''' Converting a fully-populated policy protobuf back into a document
    dictionary should preserve every field. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    pb = starbelly.starbelly_pb2.Policy()
    # Protobuf carries the policy ID as raw 16 bytes; the document side uses
    # the equivalent UUID string (asserted below).
    pb.policy_id = \
        b'\x01\xb6\x0e\xeb*\xc9OA\x9b\x0cG\xdc\xbc\xf67\xf7'
    pb.name = 'Test'
    # Timestamps travel as ISO-8601 strings in the protobuf.
    pb.created_at = created_at.isoformat()
    pb.updated_at = updated_at.isoformat()
    # Authentication
    pb.authentication.enabled = True
    # Limits
    pb.limits.max_cost = 10
    pb.limits.max_duration = 3600
    pb.limits.max_items = 10_000
    # MIME type rules
    mime1 = pb.mime_type_rules.add()
    mime1.match = MATCH_ENUM.Value('MATCHES')
    mime1.pattern = '^text/'
    mime1.save = True
    mime2 = pb.mime_type_rules.add()
    mime2.save = False
    # Proxy rules
    proxy1 = pb.proxy_rules.add()
    proxy1.proxy_url = 'socks5://localhost:1234'
    proxy1.pattern = r'\.onion'
    proxy1.match = MATCH_ENUM.Value('MATCHES')
    # Robots.txt
    pb.robots_txt.usage = USAGE_ENUM.Value('IGNORE')
    # URL normalization
    pb.url_normalization.enabled = True
    pb.url_normalization.strip_parameters.append('PHPSESSID')
    # URL rules
    url1 = pb.url_rules.add()
    url1.action = ACTION_ENUM.Value('ADD')
    url1.amount = 1
    url1.match = MATCH_ENUM.Value('MATCHES')
    url1.pattern = '^https?://({SEED_DOMAINS})/'
    url2 = pb.url_rules.add()
    url2.action = ACTION_ENUM.Value('MULTIPLY')
    url2.amount = 0
    # User agents
    agent1 = pb.user_agents.add()
    agent1.name = 'Test User Agent'
    doc = Policy.convert_pb_to_doc(pb)
    assert doc['id'] == '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7'
    assert doc['name'] == 'Test'
    # The ISO strings parse back into datetimes equal to the originals.
    assert doc['created_at'] == created_at
    assert doc['updated_at'] == updated_at
    # Authentication
    assert doc['authentication']['enabled']
    # Limits
    assert doc['limits']['max_cost'] == 10
    # MIME type rules
    assert len(doc['mime_type_rules']) == 2
    mime1 = doc['mime_type_rules'][0]
    mime2 = doc['mime_type_rules'][1]
    assert mime1['match'] == 'MATCHES'
    assert mime1['pattern'] == '^text/'
    assert mime1['save']
    assert not mime2['save']
    # Proxy rules
    assert len(doc['proxy_rules']) == 1
    proxy1 = doc['proxy_rules'][0]
    assert proxy1['proxy_url'] == 'socks5://localhost:1234'
    # Robots.txt
    assert doc['robots_txt']['usage'] == 'IGNORE'
    # URL normalization
    assert doc['url_normalization']['enabled']
    assert doc['url_normalization']['strip_parameters'] == ['PHPSESSID']
    # URL rules
    assert len(doc['url_rules']) == 2
    url1 = doc['url_rules'][0]
    url2 = doc['url_rules'][1]
    assert url1['action'] == 'ADD'
    assert url1['amount'] == 1
    assert url1['match'] == 'MATCHES'
    assert url1['pattern'] == '^https?://({SEED_DOMAINS})/'
    assert url2['action'] == 'MULTIPLY'
    assert url2['amount'] == 0
    # User agents
    assert len(doc['user_agents']) == 1
    agent1 = doc['user_agents'][0]
    assert agent1['name'] == 'Test User Agent'
def test_convert_policy_doc_to_pb_captcha():
    ''' A captcha solver UUID string in the document should convert to raw
    16 bytes in the protobuf. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'captcha_solver_id': 'e27223d3-85ef-4e89-8fc8-dbcf8df0ce97',
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    pb = starbelly.starbelly_pb2.Policy()
    Policy.convert_doc_to_pb(doc, pb)
    expected_id = \
        b'\xe2\x72\x23\xd3\x85\xef\x4e\x89\x8f\xc8\xdb\xcf\x8d\xf0\xce\x97'
    assert pb.captcha_solver_id == expected_id
def test_policy_replace_mime_rules():
    ''' replace_mime_type_rules() should return a new policy that shares
    every sub-policy with the original except the MIME type rules. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    base = Policy(doc, version='1.0.0', seeds=[])
    replaced = base.replace_mime_type_rules([
        {'match': 'MATCHES', 'pattern': '^application/', 'save': True},
        {'save': False},
    ])
    # Every other sub-policy object is shared (identity, not just equality):
    shared_attrs = ('authentication', 'captcha_solver', 'limits',
        'proxy_rules', 'robots_txt', 'url_normalization', 'url_rules',
        'user_agents')
    for attr in shared_attrs:
        assert getattr(base, attr) is getattr(replaced, attr)
    # The MIME type rules object is new and behaves differently:
    assert base.mime_type_rules is not replaced.mime_type_rules
    assert base.mime_type_rules.should_save('text/plain')
    assert not base.mime_type_rules.should_save('application/json')
    assert not replaced.mime_type_rules.should_save('text/plain')
    assert replaced.mime_type_rules.should_save('application/json')
async def test_crawl_extractor(nursery):
    ''' The extractor should consume a download response, extract its links,
    delete the originating frontier item, and insert new frontier items in
    batches of ``batch_size``. '''
    # Create test fixtures.
    job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
    db = Mock()
    db.delete_frontier_item = AsyncMock()
    db.insert_frontier_items = AsyncMock()
    # Zero-capacity channels: sends block until the extractor receives.
    to_extractor, extractor_recv = trio.open_memory_channel(0)
    extractor_send, from_extractor = trio.open_memory_channel(0)
    created_at = datetime(2018,12,31,13,47,00)
    policy_doc = {
        'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': created_at,
        'authentication': {
            'enabled': False,
        },
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {
            'usage': 'IGNORE',
        },
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['b'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [
            {'name': 'Test User Agent'}
        ]
    }
    policy = Policy(policy_doc, '1.0.0', ['https://extractor.example'])
    downloader = Mock()
    robots_txt_manager = Mock()
    robots_txt_manager.is_allowed = AsyncMock(return_value=True)
    # NOTE(review): presumably the URL hash for
    # 'http://extractor.example/old-url', so that link counts as already
    # seen — confirm against CrawlExtractor's hashing scheme.
    old_urls = {b'\xd2\x1b\x9b(p-\xed\xb2\x10\xdf\xf0\xa8\xe1\xa2*<'}
    stats_dict = {'frontier_size': 0}
    extractor = CrawlExtractor(job_id, db, extractor_send, extractor_recv,
        policy, downloader, robots_txt_manager, old_urls, stats_dict,
        batch_size=3)
    assert repr(extractor) == '<CrawlExtractor job_id=aaaaaaaa>'
    nursery.start_soon(extractor.run)
    # The HTML document has 5 valid links (enough to create two batches when
    # the `insert_batch` is set to 3) as well as 1 link that's out of domain
    # (should not be added to frontier) and 1 link that's in `old_urls` (also
    # should not be added to frontier).
    html_body = \
        b'''<!DOCTYPE html>
        <html>
        <head><meta charset="UTF-8"><title>Test</title></head>
        <body>
        <a href='http://extractor.example/alpha'>Alpha</a>
        <a href='http://extractor.example/bravo'>Bravo</a>
        <a href='http://extractor.example/charlie'>Charlie</a>
        <a href='http://invalid.example/'>Invalid</a>
        <a href='http://extractor.example/delta'>Delta</a>
        <a href='http://extractor.example/echo'>Echo</a>
        <a href='http://extractor.example/old-url'>Echo</a>
        </body>
        </html>'''
    response = DownloadResponse(
        frontier_id='bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        cost=1.0,
        url='https://extractor.example',
        canonical_url='https://extractor.example',
        content_type='text/html',
        body=html_body,
        started_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc),
        completed_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc),
        exception=None,
        status_code=200,
        headers=dict()
    )
    await to_extractor.send(response)
    await from_extractor.receive()
    # The item should be deleted from the frontier:
    assert db.delete_frontier_item.call_count == 1
    # NOTE(review): `call_args[0]` is normally the positional-args *tuple*;
    # comparing it to a bare string suggests a custom AsyncMock helper —
    # verify this assertion actually exercises the argument.
    assert db.delete_frontier_item.call_args[0] == \
        'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
    # The insert function should be called twice: once with three items
    # (alpha, bravo charlie), and once with two items (delta, echo).
    assert db.insert_frontier_items.call_count == 2
    assert len(db.insert_frontier_items.call_args[0]) == 2
    assert stats_dict['frontier_size'] == 5
    assert robots_txt_manager.is_allowed.call_count == 6
assert server_db.get_policy.call_args[0] == policy_id assert response1.policy.name == 'Test Policy' # List policies command2 = new_request(2) command2.list_policies.page.limit = 10 command2.list_policies.page.offset = 0 response2 = await send_test_command(client, command2) assert response2.list_policies.total == 1 assert response2.list_policies.policies[0].name == 'Test Policy' # Set policy command3 = new_request(3) policy2_doc = policy_doc.copy() del policy2_doc['id'] Policy.convert_doc_to_pb(policy2_doc, command3.set_policy.policy) response3 = await send_test_command(client, command3) assert response3.new_policy.policy_id == b'\xaa' * 16 # Delete policy command4 = new_request(4) command4.delete_policy.policy_id = b'\xaa' * 16 response4 = await send_test_command(client, command4) assert response4.is_success assert server_db.delete_policy.call_args[0] == policy_id @fail_after(3) async def test_list_rate_limits(client, server_db): token = b'\xaa' * 16 rate_limit = {
def make_policy(captcha_port=80):
    ''' Build a policy fixture whose captcha solver points at a local test
    server listening on ``captcha_port``. '''
    fixed_time = datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': fixed_time,
        'updated_at': fixed_time,
        'authentication': {'enabled': False},
        'captcha_solver': {
            'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f8',
            'name': 'Example CAPTCHA',
            'service_url': f'http://127.0.0.1:{captcha_port}',
            'api_key': None,
            'require_phrase': False,
            'case_sensitive': True,
            'characters': 'ALPHANUMERIC',
            'require_math': False,
        },
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(doc, '1.0.0', ['https://login.example'])