Пример #1
0
def test_policy_constructor_blank_name():
    ''' Constructing a ``Policy`` from a document with an empty name raises
    ``PolicyValidationError``. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        # Empty name: this is the value under test.
        'name': '',
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [
            {'proxy_url': 'socks5://localhost:1234'},
        ],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {
                'action': 'ADD',
                'amount': 1,
                'match': 'MATCHES',
                'pattern': '^https?://({SEED_DOMAINS})/',
            },
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    # The constructor is expected to raise, so its result is not bound to a
    # name (the original assigned it to an unused local).
    with pytest.raises(PolicyValidationError):
        Policy(doc, version='1.0.0', seeds=[])
Пример #2
0
def make_policy(proxy=None):
    ''' Build a sample ``Policy``; ``proxy`` supplies the proxy rules and an
    empty list is used when it is falsy. '''
    timestamp = datetime(2018, 12, 31, 13, 47, 0)
    mime_rules = [
        {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
        {'save': False},
    ]
    url_rules = [
        {
            'action': 'ADD',
            'amount': 1,
            'match': 'MATCHES',
            'pattern': '^https?://({SEED_DOMAINS})/',
        },
        {'action': 'MULTIPLY', 'amount': 0},
    ]
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': timestamp,
        'updated_at': timestamp,
        'authentication': {'enabled': False},
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': mime_rules,
        'proxy_rules': proxy or [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': url_rules,
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(policy_doc, '1.0.0', ['https://seeds.example'])
Пример #3
0
def make_policy():
    ''' Build a sample ``Policy`` seeded with ``https://frontier.example``. '''
    stamp = datetime(2018, 12, 31, 13, 47, 0)
    limits = {'max_cost': 10, 'max_duration': 3600, 'max_items': 10_000}
    doc = {
        'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        'name': 'Test',
        'created_at': stamp,
        'updated_at': stamp,
        'authentication': {'enabled': True},
        'limits': limits,
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': ['b']},
        'url_rules': [
            {
                'action': 'ADD',
                'amount': 1,
                'match': 'MATCHES',
                'pattern': '^https?://({SEED_DOMAINS})/',
            },
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    seeds = ['https://frontier.example']
    return Policy(doc, '1.0.0', seeds)
Пример #4
0
def make_policy(usage, user_agent):
    ''' Build a sample ``Policy`` with the given robots.txt ``usage`` value
    and a single user agent named ``user_agent``. '''
    timestamp = datetime(2018, 12, 31, 13, 47, 0)
    robots_txt = {'usage': usage}
    agents = [{'name': user_agent}]
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': timestamp,
        'updated_at': timestamp,
        'authentication': {'enabled': False},
        'limits': {'max_cost': 10},
        'mime_type_rules': [{'save': True}],
        'proxy_rules': [],
        'robots_txt': robots_txt,
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [{'action': 'MULTIPLY', 'amount': 0}],
        'user_agents': agents,
    }
    return Policy(policy_doc, '1.0.0', ['https://seeds.example'])
Пример #5
0
def test_policy_constructor_captcha():
    ''' A policy document that includes a ``captcha_solver`` section yields a
    ``CaptchaSolver`` instance on the constructed policy. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    solver_doc = {
        'id': b'captcha1',
        'name': 'CAPTCHA Solver 1',
        'service_url': 'https://solver.example',
        'api_key': 'test-key',
        'require_phrase': False,
        'case_sensitive': False,
        'characters': 'abcdefg',
        'require_math': False,
        'min_length': 6,
        'max_length': 6,
    }
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {'enabled': True},
        'captcha_solver': solver_doc,
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {
                'action': 'ADD',
                'amount': 1,
                'match': 'MATCHES',
                'pattern': '^https?://({SEED_DOMAINS})/',
            },
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    built = Policy(policy_doc, version='1.0.0', seeds=[])
    assert isinstance(built.captcha_solver, CaptchaSolver)
Пример #6
0
def test_policy_constructor():
    ''' A valid policy document produces a ``Policy`` whose sub-policies have
    the expected types and whose CAPTCHA solver is ``None``. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    mime_rules = [
        {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
        {'save': False},
    ]
    url_rules = [
        {
            'action': 'ADD',
            'amount': 1,
            'match': 'MATCHES',
            'pattern': '^https?://({SEED_DOMAINS})/',
        },
        {'action': 'MULTIPLY', 'amount': 0},
    ]
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': mime_rules,
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': url_rules,
        'user_agents': [{'name': 'Test User Agent'}],
    }
    built = Policy(policy_doc, version='1.0.0', seeds=[])
    # Each section of the document should be wrapped in its own policy type;
    # no captcha_solver section was supplied, so that attribute stays None.
    assert isinstance(built.authentication, PolicyAuthentication)
    assert built.captcha_solver is None
    assert isinstance(built.limits, PolicyLimits)
    assert isinstance(built.mime_type_rules, PolicyMimeTypeRules)
    assert isinstance(built.proxy_rules, PolicyProxyRules)
    assert isinstance(built.robots_txt, PolicyRobotsTxt)
    assert isinstance(built.url_normalization, PolicyUrlNormalization)
    assert isinstance(built.url_rules, PolicyUrlRules)
    assert isinstance(built.user_agents, PolicyUserAgents)
Пример #7
0
def test_policy_replace_mime_rules():
    ''' ``replace_mime_type_rules`` returns a policy that shares every
    sub-policy with the original except the MIME type rules. '''
    created_at = datetime.now(timezone.utc)
    updated_at = datetime.now(timezone.utc) + timedelta(minutes=1)
    policy_doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': updated_at,
        'authentication': {'enabled': True},
        'limits': {'max_cost': 10},
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [{'proxy_url': 'socks5://localhost:1234'}],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['PHPSESSID'],
        },
        'url_rules': [
            {
                'action': 'ADD',
                'amount': 1,
                'match': 'MATCHES',
                'pattern': '^https?://({SEED_DOMAINS})/',
            },
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    replacement_rules = [
        {'match': 'MATCHES', 'pattern': '^application/', 'save': True},
        {'save': False},
    ]
    original = Policy(policy_doc, version='1.0.0', seeds=[])
    replaced = original.replace_mime_type_rules(replacement_rules)

    # Everything except the MIME type rules is the very same object.
    assert original.authentication is replaced.authentication
    assert original.captcha_solver is replaced.captcha_solver
    assert original.limits is replaced.limits
    assert original.proxy_rules is replaced.proxy_rules
    assert original.robots_txt is replaced.robots_txt
    assert original.url_normalization is replaced.url_normalization
    assert original.url_rules is replaced.url_rules
    assert original.user_agents is replaced.user_agents

    # The MIME type rules are a distinct object with the swapped behavior.
    assert original.mime_type_rules is not replaced.mime_type_rules
    assert original.mime_type_rules.should_save('text/plain')
    assert not original.mime_type_rules.should_save('application/json')
    assert not replaced.mime_type_rules.should_save('text/plain')
    assert replaced.mime_type_rules.should_save('application/json')
Пример #8
0
async def test_crawl_extractor(nursery):
    ''' Send one HTML download response through a ``CrawlExtractor`` and
    check the resulting frontier deletes/inserts, the frontier size stat, and
    the number of robots.txt checks. '''
    # Create test fixtures: a mocked database, one zero-capacity channel pair
    # in each direction, and a policy seeded with https://extractor.example.
    job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
    db = Mock()
    db.delete_frontier_item = AsyncMock()
    db.insert_frontier_items = AsyncMock()
    to_extractor, extractor_recv = trio.open_memory_channel(0)
    extractor_send, from_extractor = trio.open_memory_channel(0)
    created_at = datetime(2018,12,31,13,47,00)
    policy_doc = {
        'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        'name': 'Test',
        'created_at': created_at,
        'updated_at': created_at,
        'authentication': {
            'enabled': False,
        },
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {
            'usage': 'IGNORE',
        },
        'url_normalization': {
            'enabled': True,
            'strip_parameters': ['b'],
        },
        'url_rules': [
            {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
             'pattern': '^https?://({SEED_DOMAINS})/'},
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [
            {'name': 'Test User Agent'}
        ]
    }
    policy = Policy(policy_doc, '1.0.0', ['https://extractor.example'])
    downloader = Mock()
    robots_txt_manager = Mock()
    robots_txt_manager.is_allowed = AsyncMock(return_value=True)
    # Presumably the hash of the `old-url` link in the HTML below -- verify
    # against the extractor's URL hashing.
    old_urls = {b'\xd2\x1b\x9b(p-\xed\xb2\x10\xdf\xf0\xa8\xe1\xa2*<'}
    stats_dict = {'frontier_size': 0}
    extractor = CrawlExtractor(job_id, db, extractor_send, extractor_recv,
        policy, downloader, robots_txt_manager, old_urls, stats_dict,
        batch_size=3)
    assert repr(extractor) == '<CrawlExtractor job_id=aaaaaaaa>'
    nursery.start_soon(extractor.run)

    # The HTML document has 5 valid links (enough to create two batches when
    # `batch_size` is set to 3) as well as 1 link that's out of domain (should
    # not be added to frontier) and 1 link that's in `old_urls` (also should
    # not be added to frontier).
    html_body = \
    b'''<!DOCTYPE html>
        <html>
            <head><meta charset="UTF-8"><title>Test</title></head>
            <body>
                <a href='http://extractor.example/alpha'>Alpha</a>
                <a href='http://extractor.example/bravo'>Bravo</a>
                <a href='http://extractor.example/charlie'>Charlie</a>
                <a href='http://invalid.example/'>Invalid</a>
                <a href='http://extractor.example/delta'>Delta</a>
                <a href='http://extractor.example/echo'>Echo</a>
                <a href='http://extractor.example/old-url'>Echo</a>
            </body>
        </html>'''
    response = DownloadResponse(
        frontier_id='bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
        cost=1.0,
        url='https://extractor.example',
        canonical_url='https://extractor.example',
        content_type='text/html',
        body=html_body,
        started_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc),
        completed_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc),
        exception=None,
        status_code=200,
        headers=dict()
    )
    # Hand the response to the extractor, then wait for it to emit something
    # on its output channel before inspecting the mocks.
    await to_extractor.send(response)
    await from_extractor.receive()
    # The item should be deleted from the frontier.
    # NOTE(review): the `call_args[0]` checks below treat index 0 as the first
    # positional argument of the last call; that depends on how the AsyncMock
    # used here exposes `call_args` -- confirm against its definition.
    assert db.delete_frontier_item.call_count == 1
    assert db.delete_frontier_item.call_args[0] == \
        'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
    # The insert function should be called twice: once with three items
    # (alpha, bravo, charlie), and once with two items (delta, echo).
    assert db.insert_frontier_items.call_count == 2
    assert len(db.insert_frontier_items.call_args[0]) == 2
    assert stats_dict['frontier_size'] == 5
    assert robots_txt_manager.is_allowed.call_count == 6
Пример #9
0
def make_policy(captcha_port=80):
    ''' Build a sample ``Policy`` whose CAPTCHA solver service URL points at
    127.0.0.1 on ``captcha_port``. '''
    stamp = datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc)
    solver_url = 'http://127.0.0.1:{}'.format(captcha_port)
    captcha_solver = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f8',
        'name': 'Example CAPTCHA',
        'service_url': solver_url,
        'api_key': None,
        'require_phrase': False,
        'case_sensitive': True,
        'characters': 'ALPHANUMERIC',
        'require_math': False,
    }
    doc = {
        'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
        'name': 'Test',
        'created_at': stamp,
        'updated_at': stamp,
        'authentication': {'enabled': False},
        'captcha_solver': captcha_solver,
        'limits': {
            'max_cost': 10,
            'max_duration': 3600,
            'max_items': 10_000,
        },
        'mime_type_rules': [
            {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
            {'save': False},
        ],
        'proxy_rules': [],
        'robots_txt': {'usage': 'IGNORE'},
        'url_normalization': {'enabled': True, 'strip_parameters': []},
        'url_rules': [
            {
                'action': 'ADD',
                'amount': 1,
                'match': 'MATCHES',
                'pattern': '^https?://({SEED_DOMAINS})/',
            },
            {'action': 'MULTIPLY', 'amount': 0},
        ],
        'user_agents': [{'name': 'Test User Agent'}],
    }
    return Policy(doc, '1.0.0', ['https://login.example'])