def test_get_response_text(self):
    text = 'Example'
    response = FakedObject()
    response.read = lambda: text
    result = self.linkExtractor._get_response_text(response, func_name='read')
    self.assertTrue(isinstance(result, str))
    self.assertEqual(text, result)
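# FakedObject is not defined in this excerpt. A minimal sketch consistent
# with how these tests use it (arbitrary keyword attributes at construction,
# plus attributes assigned afterwards, e.g. `response.read = ...` above)
# might look like this; the real test double may differ.
class FakedObject:
    """Simple attribute bag standing in for response/task objects."""

    def __init__(self, **kwargs):
        # Expose every keyword argument as an instance attribute.
        self.__dict__.update(kwargs)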
def test_get_response(self):
    from common_crawler.http import Response
    from lxml.etree import _Element

    async def text():
        return 'HTML'

    response = FakedObject(url=_TARGET_URL,
                           status=200,
                           charset='utf-8',
                           content_type='text/html',
                           content_length=233,
                           reason='OK',
                           headers={'connection': 'keep-alive'})
    response.text = text

    async def work():
        async with AioHttpClient() as client:
            result = await client.get_response(response)
            self.assertTrue(isinstance(result, Response))
            self.assertTrue(isinstance(result.url, str))
            self.assertEqual(_TARGET_URL, result.url)
            self.assertTrue(isinstance(result.text, str))
            self.assertEqual('HTML', result.text)
            self.assertTrue(isinstance(result.status, int))
            self.assertEqual(result.status, 200)
            self.assertTrue(isinstance(result.charset, str))
            self.assertEqual(result.charset, 'utf-8')
            self.assertTrue(isinstance(result.content_type, str))
            self.assertEqual(result.content_type, 'text/html')
            self.assertTrue(isinstance(result.content_length, int))
            self.assertEqual(result.content_length, 233)
            self.assertTrue(isinstance(result.reason, str))
            self.assertEqual(result.reason, 'OK')
            self.assertTrue(isinstance(result.headers, dict))
            self.assertTrue('connection' in result.headers)
            self.assertEqual(result.headers['connection'], 'keep-alive')
            self.assertTrue(isinstance(result.selector, _Element))

    _LOOP.run_until_complete(work())
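# The assertions above pin down get_response's contract: plain attributes are
# copied off the underlying response, the awaited body becomes a plain `text`
# string, and the body is parsed into an lxml element exposed as `selector`.
# A hypothetical helper consistent with those assertions (the client's actual
# implementation may differ) could look like:
from lxml import etree

async def _fake_get_response(resp):
    # Await the body, then mirror the attributes the test asserts on.
    text = await resp.text()
    return FakedObject(url=str(resp.url), text=text, status=resp.status,
                       charset=resp.charset, content_type=resp.content_type,
                       content_length=resp.content_length, reason=resp.reason,
                       headers=dict(resp.headers),
                       selector=etree.HTML(text))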
def test_clean_for_finished_urls(self):
    engine = self._get_default_engine()
    engine.crawler.finished_urls = [FakedObject(url='f'),
                                    FakedObject(url='g'),
                                    None,
                                    FakedObject(url='d'),
                                    None,
                                    FakedObject(url='a')]
    engine._clean_for_finished_urls()
    self.assertEqual(len(engine.crawler.finished_urls), 4)
    self.assertEqual(engine.crawler.finished_urls[0].url, 'a')
    self.assertEqual(engine.crawler.finished_urls[1].url, 'd')
    self.assertEqual(engine.crawler.finished_urls[2].url, 'f')
    self.assertEqual(engine.crawler.finished_urls[3].url, 'g')
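# The assertions above imply a simple contract for _clean_for_finished_urls:
# drop the None placeholders and sort the remaining tasks by URL. A
# hypothetical implementation matching that contract (the engine's actual
# code may differ):
def _clean_for_finished_urls(self):
    self.crawler.finished_urls = sorted(
        (task for task in self.crawler.finished_urls if task is not None),
        key=lambda task: task.url)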
def test_restrict_css(self):
    css_name_01 = "container"
    css_name_02 = "btn"
    url_01 = "https://www.python.org"
    url_02 = "http://www.google.com"
    html = """
    <html>
    <head></head>
    <body>
        <div id="div01" value="%s" class="%s">div01</div>
        <div id="div02" value="%s" class="%s">div02</div>
        <div id="div03">div03</div>
    </body>
    </html>
    """ % (url_01, css_name_01, url_02, css_name_02)
    linkExtractor = LxmlLinkExtractor(tags=('div',),
                                      attrs=('value',),
                                      restrict_css=('.%s' % css_name_01,
                                                    '.%s' % css_name_02))
    response = FakedObject(url="https://www.example.com", text=html)
    links = linkExtractor.extract_links(response)
    self.assertEqual(len(links), 2)
    self.assertEqual(links[0].url, url_01)
    self.assertEqual(links[0].text, 'div01')
    self.assertEqual(links[1].url, url_02)
    self.assertEqual(links[1].text, 'div02')
def setUp(self):
    self.response = FakedObject(url=_URL,
                                status=_STATUS,
                                headers=_HEADERS,
                                charset=_CHARSET,
                                content_type=_CONTENT_TYPE,
                                content_length=_CONTENT_LENGTH,
                                reason=_REASON)
    self.response.text = _TEXT
def add_to_task_queue(self, url):
    # Accept either a single URL or an iterable of URLs.
    if isinstance(url, str):
        url = [url]
    for u in url:
        self.task_queue.put_nowait(FakedObject(
            url=u,
            parsed_data=None,
            exception=None,
            redirect_num=0,
            retries_num=0,
            redirect_url=None,
            response=FakedObject(status=200,
                                 charset='utf-8',
                                 content_type='text/html',
                                 content_length=None,
                                 reason='OK',
                                 headers=None)
        ))
def test_head(self, mocked):
    status = 200
    mocked.return_value = FakedObject(status=status)

    async def work():
        async with AioHttpClient() as client:
            async with client.head(url=_TARGET_URL) as resp:
                self.assertEqual(status, resp.status)

    _LOOP.run_until_complete(work())
    mocked.assert_called_once_with(url=_TARGET_URL)
def setUp(self):
    self.configuration = {
        'name': 'common_crawler',
        'roots': ('http://www.example.com',),
        'deny_domains': (),
        'allow_domains': (),
        'strict': True,
        'follow': True,
        'allowed_rule': (),
        'denied_rule': (),
        'log_level': 2,
        'log_filename': None,
        'log_format': '%(asctime)s:%(levelname)s:%(message)s',
        'log_init_fn': 'common_crawler.engines._init_logging',
        'max_redirect': 10,
        'max_retries': 4,
        'max_tasks': 100,
        'interval': 1
    }
    self.task = FakedObject(url='https://www.google.com',
                            parsed_data=None,
                            exception=None,
                            redirect_num=0,
                            retries_num=0,
                            redirect_url=None)
    self.task.response = FakedObject(url=self.task.url,
                                     status=200,
                                     charset='utf-8',
                                     content_type='text/html',
                                     content_length=None,
                                     reason='OK',
                                     headers=None,
                                     text='<html><body><a href="/abc"></a></body></html>')
    self.task_queue = asyncio.Queue()
    self.http_client = FakedObject()
    self.parse_link = lambda x: print('Parsing...')
def setUp(self):
    self.open_path = 'builtins.open'
    url = 'https://www.example.com'
    response = FakedObject(url=url,
                           status=200,
                           charset='utf-8',
                           content_type=None,
                           content_length=None,
                           reason=None,
                           headers=None,
                           text=None)
    self.task = Task(url=url, parsed_data='Text', response=response)
def test_crawl_as_redirect(self, mocked):
    mocked.return_value = FakedObject(
        url=_URL,
        status=301,
        headers={'location': 'https://www.python.org'},
        charset=_CHARSET,
        content_type=_CONTENT_TYPE,
        content_length=_CONTENT_LENGTH,
        reason=_REASON,
        text=_TEXT)
    tasks = []

    async def work(crawler):
        async for t in crawler.crawl():
            tasks.append(t)

    crawler = AsyncCrawler(roots=_URL)
    launcher = AsyncCrawlerLauncher(crawler=crawler, work=work)
    launcher.run()
    # The 301 response is followed as a redirect, so no task is yielded
    # as finished.
    self.assertEqual(0, len(tasks))
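# The zero-length assertion above relies on the crawler treating a 3xx
# status as "follow, don't finish". A hypothetical helper for extracting
# the next hop, consistent with the mocked headers (the crawler's actual
# redirect handling may differ):
def _redirect_target(response):
    # 3xx responses carry the next URL in the Location header.
    if 300 <= response.status < 400:
        return response.headers.get('location')
    return None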
def setUp(self):
    # Raw strings keep the regex backslashes from being treated as
    # (deprecated) string escape sequences.
    allow = r'http[s]?:\/\/www.google.com\/?\w*'
    deny = r'http[s]?:\/\/www.google.com\/?hello'
    deny_domains = ['www.amazon.com']
    deny_extensions = ['mp3', 'pdf', 'ppt']
    html = '''
    <html>
    <head></head>
    <body>
        <a href="https://www.google.com/">Google</a>
        <a href="https://www.google.com/">Google2</a>
        <a href="/python">Python</a>
        <a href="/world">World</a>
    </body>
    </html>
    '''
    response = FakedObject(url='https://www.google.com', text=html)
    self.response = response
    self.linkExtractor = LxmlLinkExtractor(allow=allow,
                                           deny=deny,
                                           deny_domains=deny_domains,
                                           deny_extensions=deny_extensions)
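# Quick illustrative check (not part of the suite) of how the allow/deny
# patterns interact: the allow pattern accepts the root and word-only
# paths, while the deny pattern knocks out the "/hello" path specifically.
import re

assert re.match(r'http[s]?:\/\/www.google.com\/?\w*', 'https://www.google.com/python')
assert re.match(r'http[s]?:\/\/www.google.com\/?hello', 'https://www.google.com/hello')
assert not re.match(r'http[s]?:\/\/www.google.com\/?hello', 'https://www.google.com/python')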
def __init__(self, **kwargs):
    # Zero-argument super() instead of super(self.__class__, self), which
    # recurses infinitely if this class is ever subclassed.
    super().__init__(**kwargs)
    self.return_val = FakedObject(url='https://www.link_extractor.com')
    self.count = 0