def test_policy_constructor_blank_name():
    ''' A policy document whose name is blank must be rejected by the
    Policy constructor with a PolicyValidationError. '''
    created_at = datetime.now(timezone.utc)
    # Derive updated_at from created_at instead of calling now() twice, so
    # the two timestamps always differ by exactly one minute.
    updated_at = created_at + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': '',  # Blank name: the value under test.
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234'},
        ],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    # The constructor itself should raise; the instance is never used, so
    # don't bind it to a variable (the original assigned an unused local).
    with pytest.raises(PolicyValidationError):
        Policy(doc, version='1.0.0', seeds=[])
def make_policy(proxy=None):
    ''' Build a fully-populated sample policy.

    :param proxy: Optional list of proxy rule dicts; defaults to no proxy
        rules.
    '''
    timestamp = datetime(2018, 12, 31, 13, 47, 0)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': timestamp,
        'updated_at': timestamp,
        'authentication': {'enabled': False},
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': proxy or [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(policy_doc, '1.0.0', ['https://seeds.example'])
def make_policy():
    ''' Build a sample policy for frontier tests. '''
    dt = datetime(2018, 12, 31, 13, 47, 0)
    doc = {
        'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        'name': 'Test',
        'created_at': dt,
        'updated_at': dt,
        'authentication': {'enabled': True},
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': ['b']},
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(doc, '1.0.0', ['https://frontier.example'])
def make_policy(usage, user_agent):
    ''' Build a minimal sample policy with the given robots.txt usage mode
    and user agent name. '''
    timestamp = datetime(2018, 12, 31, 13, 47, 0)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': timestamp,
        'updated_at': timestamp,
        'authentication': {'enabled': False},
        'limits': {'max_cost': 10},
        'mime_type_rules': [{'save': True}],
        'proxy_rules': [],
        'robots_txt': {'usage': usage},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [{'action': 'MULTIPLY', 'amount': 0}],
        'user_agents': [{'name': user_agent}],
    }
    return Policy(policy_doc, '1.0.0', ['https://seeds.example'])
def test_policy_constructor_captcha():
    ''' A policy document containing a captcha_solver section yields a
    Policy whose captcha_solver attribute is a CaptchaSolver. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'authentication': {'enabled': True},
        'captcha_solver': {
            'id': b'captcha1',
            'name': 'CAPTCHA Solver 1',
            'service_url': 'https://solver.example',
            'api_key': 'test-key',
            'require_phrase': False,
            'case_sensitive': False,
            'characters': 'abcdefg',
            'require_math': False,
            'min_length': 6,
            'max_length': 6,
        },
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234'},
        ],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    policy = Policy(policy_doc, version='1.0.0', seeds=[])
    assert isinstance(policy.captcha_solver, CaptchaSolver)
def test_policy_constructor():
    ''' A complete policy document constructs a Policy whose sub-policy
    attributes have the expected types. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234'},
        ],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    policy = Policy(policy_doc, version='1.0.0', seeds=[])
    # No captcha_solver section in the document, so the attribute is None.
    assert policy.captcha_solver is None
    # Each sub-policy attribute should be wrapped in its own class.
    expected_types = [
        ('authentication', PolicyAuthentication),
        ('limits', PolicyLimits),
        ('mime_type_rules', PolicyMimeTypeRules),
        ('proxy_rules', PolicyProxyRules),
        ('robots_txt', PolicyRobotsTxt),
        ('url_normalization', PolicyUrlNormalization),
        ('url_rules', PolicyUrlRules),
        ('user_agents', PolicyUserAgents),
    ]
    for attr, type_ in expected_types:
        assert isinstance(getattr(policy, attr), type_)
def test_policy_replace_mime_rules():
    ''' replace_mime_type_rules() returns a new Policy that shares every
    sub-policy with the original except the MIME type rules. '''
    created = datetime.now(timezone.utc)
    updated = datetime.now(timezone.utc) + timedelta(minutes=1)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': updated,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234'},
        ],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    policy1 = Policy(policy_doc, version='1.0.0', seeds=[])
    policy2 = policy1.replace_mime_type_rules([
        {'match': 'MATCHES', 'pattern': '^application/', 'save': True},
        {'save': False},
    ])
    # Every sub-policy except the MIME type rules is shared by identity:
    shared_attrs = (
        'authentication', 'captcha_solver', 'limits', 'proxy_rules',
        'robots_txt', 'url_normalization', 'url_rules', 'user_agents',
    )
    for attr in shared_attrs:
        assert getattr(policy1, attr) is getattr(policy2, attr)
    # The MIME type rules differ: the original saves text/*, the
    # replacement saves application/*.
    assert policy1.mime_type_rules is not policy2.mime_type_rules
    assert policy1.mime_type_rules.should_save('text/plain')
    assert not policy1.mime_type_rules.should_save('application/json')
    assert not policy2.mime_type_rules.should_save('text/plain')
    assert policy2.mime_type_rules.should_save('application/json')
async def test_crawl_extractor(nursery):
    ''' CrawlExtractor extracts links from a downloaded HTML document,
    filters them (robots.txt, seed domain, previously seen URLs), and
    inserts the survivors into the frontier in batches of ``batch_size``.

    :param nursery: trio nursery fixture used to run the extractor task.
    '''
    # Create test fixtures.
    job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
    db = Mock()
    db.delete_frontier_item = AsyncMock()
    db.insert_frontier_items = AsyncMock()
    # Two zero-capacity channels: one to feed responses into the extractor
    # and one to receive its completion signals.
    to_extractor, extractor_recv = trio.open_memory_channel(0)
    extractor_send, from_extractor = trio.open_memory_channel(0)
    created_at = datetime(2018,12,31,13,47,00)
    policy_doc = {
        'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': created_at,
        'authentication': {
            'enabled': False,
        },
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {
            'usage': 'IGNORE',
        },
        'url_normalization': {
            'enabled': True,
            # Parameter 'b' is stripped during URL normalization.
            'strip_parameters': ['b'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
                'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [
            {'name': 'Test User Agent'}
        ]
    }
    policy = Policy(policy_doc, '1.0.0', ['https://extractor.example'])
    downloader = Mock()
    # robots.txt manager permits every URL, so it never filters links here.
    robots_txt_manager = Mock()
    robots_txt_manager.is_allowed = AsyncMock(return_value=True)
    # Hash of the one URL the extractor should treat as already seen
    # (presumably the hash of http://extractor.example/old-url — confirm
    # against CrawlExtractor's hashing scheme).
    old_urls = {b'\xd2\x1b\x9b(p-\xed\xb2\x10\xdf\xf0\xa8\xe1\xa2*<'}
    stats_dict = {'frontier_size': 0}
    # batch_size=3 forces the 5 surviving links to be inserted in two
    # batches (3 + 2).
    extractor = CrawlExtractor(job_id, db, extractor_send, extractor_recv,
        policy, downloader, robots_txt_manager, old_urls, stats_dict,
        batch_size=3)
    assert repr(extractor) == '<CrawlExtractor job_id=aaaaaaaa>'
    nursery.start_soon(extractor.run)
    # The HTML document has 5 valid links (enough to create two batches when
    # the `insert_batch` is set to 3) as well as 1 link that's out of domain
    # (should not be added to frontier) and 1 link that's in `old_urls` (also
    # should not be added to frontier).
    html_body = \
        b'''<!DOCTYPE html>
        <html>
        <head><meta charset="UTF-8"><title>Test</title></head>
        <body>
        <a href='http://extractor.example/alpha'>Alpha</a>
        <a href='http://extractor.example/bravo'>Bravo</a>
        <a href='http://extractor.example/charlie'>Charlie</a>
        <a href='http://invalid.example/'>Invalid</a>
        <a href='http://extractor.example/delta'>Delta</a>
        <a href='http://extractor.example/echo'>Echo</a>
        <a href='http://extractor.example/old-url'>Echo</a>
        </body>
        </html>'''
    response = DownloadResponse(
        frontier_id='bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        cost=1.0,
        url='https://extractor.example',
        canonical_url='https://extractor.example',
        content_type='text/html',
        body=html_body,
        started_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc),
        completed_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc),
        exception=None,
        status_code=200,
        headers=dict()
    )
    # Feed the response in and wait for the extractor to finish processing.
    await to_extractor.send(response)
    await from_extractor.receive()
    # The item should be deleted from the frontier:
    assert db.delete_frontier_item.call_count == 1
    # NOTE(review): mock's call_args[0] is the positional-args *tuple*, so
    # comparing it to a bare string looks suspicious — confirm this passes
    # (expected form would be ('bbbbbbbb-…',) or call_args[0][0]).
    assert db.delete_frontier_item.call_args[0] == \
        'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
    # The insert function should be called twice: once with three items
    # (alpha, bravo charlie), and once with two items (delta, echo).
    assert db.insert_frontier_items.call_count == 2
    # NOTE(review): len(call_args[0]) counts positional arguments of the
    # *last* call, not inserted items — verify this asserts what the
    # comment above intends.
    assert len(db.insert_frontier_items.call_args[0]) == 2
    # 5 links survived filtering, so the frontier grew by 5.
    assert stats_dict['frontier_size'] == 5
    # robots.txt was consulted for 6 in-domain links (old-url included;
    # the out-of-domain link is skipped earlier).
    assert robots_txt_manager.is_allowed.call_count == 6
def make_policy(captcha_port=80):
    ''' Build a sample policy whose CAPTCHA solver points at a local
    service.

    :param captcha_port: Port number for the CAPTCHA service URL.
    '''
    created = datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created,
        'updated_at': created,
        'authentication': {'enabled': False},
        'captcha_solver': {
            'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f8',
            'name': 'Example CAPTCHA',
            'service_url': f'http://127.0.0.1:{captcha_port}',
            'api_key': None,
            'require_phrase': False,
            'case_sensitive': True,
            'characters': 'ALPHANUMERIC',
            'require_math': False,
        },
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(doc, '1.0.0', ['https://login.example'])