def get_item(self, content_type=None):
    """Return a Player item built from a fake XML response.

    If *content_type* is given, it is forwarded to ``grab.setup()`` so the
    DOM builder can be switched (e.g. to 'xml').
    """
    grab = Grab(transport=GRAB_TRANSPORT)
    if content_type is not None:
        grab.setup(content_type=content_type)
    grab.fake_response(XML)
    return Player(grab.tree)
def test_stringfield_multiple(self):
    """StringField with multiple=True must collect every matching node."""
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.fake_response(XML)

    class GameItem(Item):
        names = StringField('//game/@name', multiple=True)

    item = GameItem(grab.tree)
    self.assertEqual(['quake1', 'quake2'], item.names)
class TestUploadContent(TestCase):
    """Tests for attaching UploadContent objects to form inputs."""

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(FORMS, charset='utf-8')

    def test(self):
        fc = UploadContent('a')
        # NOTE(review): UploadContent('a') apparently compares equal to the
        # placeholder string 'xxx' (grab-specific behavior) — confirm against
        # the UploadContent implementation.
        self.assertEqual(fc, 'xxx')
        # Attaching the upload object to a file input must not raise.
        self.g.set_input('image', fc)
def test_cdata_issue(self):
    """CDATA text is lost by the HTML DOM builder but kept by the XML one."""
    g = Grab(transport=GRAB_TRANSPORT)
    g.fake_response(XML)
    # By default HTML DOM builder is used
    # It handles CDATA incorrectly
    self.assertEqual(None, g.xpath_one('//weight').text)
    self.assertEqual(None, g.tree.xpath('//weight')[0].text)
    # But XML DOM builder produces valid result
    #self.assertEqual(None, g.xpath_one('//weight').text)
    self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)
    # Use `content_type` option to change default DOM builder
    g = Grab(transport=GRAB_TRANSPORT)
    g.fake_response(XML)
    g.setup(content_type='xml')
    self.assertEqual('30', g.xpath_one('//weight').text)
    self.assertEqual('30', g.tree.xpath('//weight')[0].text)
def test_item_interface(self):
    """Exercise the Item field-access interface on a Player item.

    Fix: ``assertEquals`` is a deprecated alias of ``assertEqual``
    (removed in Python 3.12); use the canonical name throughout.
    """
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.fake_response(XML)
    player = Player(grab.tree)
    self.assertEqual(26982032, player.id)
    self.assertEqual('Ardeshir', player.first_name)
    self.assertEqual('2012-09-11 07:38:44', str(player.retrieved))
    self.assertEqual('75-zoo-1', player.calculated)
    # second access should come from the field cache
    self.assertEqual('75-zoo-1', player.calculated)
    # test assigning value
    player.calculated = 'baz'
    self.assertEqual('baz', player.calculated)
    # test FuncField
    self.assertEqual('75-zoo2-1', player.calculated2)
    # second access should come from the field cache
    self.assertEqual('75-zoo2-1', player.calculated2)
    # By default comment_cdata attribute contains empty string
    # because HTML DOM builder is used by default
    self.assertEqual('abc', player.comment)
    self.assertEqual('', player.comment_cdata)
    # We can control default DOM builder with
    # content_type option
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.fake_response(XML)
    grab.setup(content_type='xml')
    player = Player(grab.tree)
    self.assertEqual('abc', player.comment)
    self.assertEqual('abc', player.comment_cdata)
    self.assertRaises(DataNotFound, lambda: player.data_not_found)
class TestHtmlForms(TestCase):
    """Tests for Grab form handling: choose_form, set_input*, submit."""

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(FORMS)

    def test_choose_form(self):
        """
        Test ``choose_form`` method
        """
        # raise errors
        self.assertRaises(DataNotFound, lambda: self.g.choose_form(10))
        self.assertRaises(DataNotFound,
                          lambda: self.g.choose_form(id='bad_id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.choose_form(id='fake_form'))
        self.assertRaises(GrabMisuseError, lambda: self.g.choose_form())

        # check results
        self.g.choose_form(0)
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('search_form', self.g._lxml_form.get('id'))

        # reset current form
        self.g._lxml_form = None

        self.g.choose_form(id='common_form')
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('common_form', self.g._lxml_form.get('id'))

        # reset current form
        self.g._lxml_form = None

        self.g.choose_form(name='dummy')
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('dummy', self.g._lxml_form.get('name'))

        # reset current form
        self.g._lxml_form = None

        self.g.choose_form(xpath='//form[contains(@action, "/dummy")]')
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('dummy', self.g._lxml_form.get('name'))

    def assertEqualQueryString(self, qs1, qs2):
        """Assert two query strings carry the same name/value pairs.

        Fix: ``parse_qsl`` yields ``(name, value)`` pairs where value is a
        plain string; the old ``y[0]`` took only the FIRST CHARACTER of each
        value, so e.g. 'submit1' and 'submit2' compared equal and real
        differences went undetected. Compare the full pairs instead.
        """
        args1 = set(parse_qsl(qs1))
        args2 = set(parse_qsl(qs2))
        self.assertEqual(args1, args2)

    def test_submit(self):
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = POST_FORM
        g.go(SERVER.BASE_URL)
        g.set_input('name', 'Alex')
        g.submit()
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'name=Alex&secret=123')

        # Default submit control
        SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
        g.go(SERVER.BASE_URL)
        g.submit()
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'secret=123&submit1=submit1')

        # Selected submit control
        SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
        g.go(SERVER.BASE_URL)
        g.submit(submit_name='submit2')
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'secret=123&submit2=submit2')

        # Default submit control if submit control name is invalid
        SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
        g.go(SERVER.BASE_URL)
        g.submit(submit_name='submit3')
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'secret=123&submit1=submit1')

    def test_set_methods(self):
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = FORMS
        g.go(SERVER.BASE_URL)

        # No form is selected until a set_* method picks one implicitly.
        self.assertEqual(g._lxml_form, None)

        g.set_input('gender', '1')
        self.assertEqual('common_form', g._lxml_form.get('id'))

        self.assertRaises(KeyError, lambda: g.set_input('query', 'asdf'))

        g._lxml_form = None
        g.set_input_by_id('search_box', 'asdf')
        self.assertEqual('search_form', g._lxml_form.get('id'))

        g.choose_form(xpath='//form[@id="common_form"]')
        g.set_input_by_number(0, 'asdf')

        g._lxml_form = None
        g.set_input_by_xpath('//*[@name="gender"]', '2')
        self.assertEqual('common_form', g._lxml_form.get('id'))

    def test_html_without_forms(self):
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = NO_FORM_HTML
        g.go(SERVER.BASE_URL)
        self.assertRaises(DataNotFound, lambda: g.form)

    def test_disabled_radio(self):
        """
        Bug #57
        """
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = DISABLED_RADIO_HTML
        g.go(SERVER.BASE_URL)
        g.submit(make_request=False)
def fetch(self):
    """ Download urls via multicurl. Get new tasks from queue.

    Generator. Yields ``None`` while idle (all handles free, no tasks) and
    a result dict (keys: ok, grab, grab_original, task, ecode, emsg) for
    each completed request. Cached GET responses short-circuit the network
    entirely.
    """
    m = pycurl.CurlMulti()
    m.handles = []

    # Create curl instances
    for x in xrange(self.thread_number):
        curl = pycurl.Curl()
        m.handles.append(curl)

    # Handles not currently busy with a transfer
    freelist = m.handles[:]

    # This is infinite cycle
    # You can break it only from outside code which
    # iterates over result of this method
    while True:
        cached_request = None

        # Pre-request cycle: pull tasks from the queue and bind each one
        # to a free curl handle until no handle (or task) is left.
        while len(freelist):
            # Increase request counter
            if (self.request_limit is not None and
                    self.counters['request'] >= self.request_limit):
                logging.debug('Request limit is reached: %s' %
                              self.request_limit)
                if len(freelist) == self.thread_number:
                    # Nothing in flight either — signal idleness
                    yield None
                else:
                    break
            else:
                try:
                    priority, task = self.taskq.get(True, 0.1)
                except Empty:
                    # If All handlers are free and no tasks in queue
                    # yield None signal
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    if not self._preprocess_task(task):
                        continue
                    task.network_try_count += 1
                    if task.task_try_count == 0:
                        task.task_try_count = 1
                    if task.task_try_count > self.task_try_limit:
                        logging.debug('Task tries ended: %s / %s' % (
                            task.name, task.url))
                        self.add_item('too-many-task-tries', task.url)
                        continue
                    if task.network_try_count > self.network_try_limit:
                        logging.debug('Network tries ended: %s / %s' % (
                            task.name, task.url))
                        self.add_item('too-many-network-tries', task.url)
                        continue

                    #import pdb; pdb.set_trace()
                    if task.grab:
                        grab = task.grab
                    else:
                        # Set up curl instance via Grab interface
                        grab = Grab(**self.grab_config)
                        grab.setup(url=task.url)

                    # Only GET requests are served from cache
                    if self.use_cache and not task.get('disable_cache'):
                        if grab.detect_request_method() == 'GET':
                            url = grab.config['url']
                            # Cache key is sha1 of the utf-8 url when
                            # cache_key_hash is enabled, else the raw url
                            utf_url = url.encode('utf-8') if isinstance(url, unicode) else url
                            if self.cache_key_hash:
                                url_hash = sha1(utf_url).hexdigest()
                            else:
                                url_hash = url
                            cache_item = self.cache.find_one({'_id': url_hash})
                            if cache_item:
                                #if url in self.cache:
                                #cache_item = pickle.loads(self.cache[url])
                                logging.debug('From cache: %s' % url)
                                cached_request = (grab, grab.clone(), task,
                                                  cache_item)
                                grab.prepare_request()
                                self.inc_count('request-cache')

                                # break from prepre-request cycle
                                # and go to process-response code
                                break

                    self.inc_count('request-network')
                    if self.proxylist_config:
                        args, kwargs = self.proxylist_config
                        grab.setup_proxylist(*args, **kwargs)

                    # Bind grab/task to a free handle and register it
                    curl = freelist.pop()
                    curl.grab = grab
                    curl.grab.curl = curl
                    curl.grab_original = grab.clone()
                    curl.grab.prepare_request()
                    curl.task = task

                    # Add configured curl instance to multi-curl processor
                    m.add_handle(curl)

        # If there were done network requests
        if len(freelist) != self.thread_number:
            while True:
                status, active_objects = m.perform()
                if status != pycurl.E_CALL_MULTI_PERFORM:
                    break

        # Serve the (at most one) cache hit collected above
        if cached_request:
            grab, grab_original, task, cache_item = cached_request
            url = task.url# or grab.config['url']
            grab.fake_response(cache_item['body'])

            if self.use_cache_compression:
                body = zlib.decompress(cache_item['body'])
            else:
                body = cache_item['body'].encode('utf-8')

            def custom_prepare_response(g):
                # Restore response attributes from the cache document
                g.response.head = cache_item['head'].encode('utf-8')
                g.response.body = body
                g.response.code = cache_item['response_code']
                g.response.time = 0
                g.response.url = cache_item['url']
                g.response.parse('utf-8')
                g.response.cookies = g.extract_cookies()

            grab.process_request_result(custom_prepare_response)

            yield {'ok': True, 'grab': grab, 'grab_original': grab_original,
                   'task': task, 'ecode': None, 'emsg': None}
            self.inc_count('request')

        # Drain finished transfers and recycle their handles
        while True:
            queued_messages, ok_list, fail_list = m.info_read()

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None))
            for curl, ecode, emsg in fail_list:
                results.append((False, curl, ecode, emsg))

            for ok, curl, ecode, emsg in results:
                res = self.process_multicurl_response(ok, curl,
                                                      ecode, emsg)
                m.remove_handle(curl)
                freelist.append(curl)
                yield res
                self.inc_count('request')

            if not queued_messages:
                break

        m.select(0.5)
def test_find(self):
    """GameItem.find() must yield an item per matching node, in order."""
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.fake_response(XML)
    found = [game.name for game in GameItem.find(grab.doc)]
    self.assertEqual(['quake1', 'quake2'], found)
def fetch(self):
    """ Download urls via multicurl. Get new tasks from queue.

    Generator. Yields ``None`` while idle (all handles free, no tasks) and
    a result dict (keys: ok, grab, grab_original, task, ecode, emsg) for
    each completed request. Cached GET responses short-circuit the network.
    """
    m = pycurl.CurlMulti()
    m.handles = []

    # Create curl instances
    for x in xrange(self.thread_number):
        curl = pycurl.Curl()
        m.handles.append(curl)

    # Handles not currently busy with a transfer
    freelist = m.handles[:]

    # This is infinite cycle
    # You can break it only from outside code which
    # iterates over result of this method
    while True:
        cached_request = None

        # Pre-request cycle: pull tasks from the queue and bind each one
        # to a free curl handle until no handle (or task) is left.
        while len(freelist):
            # Increase request counter
            if (self.request_limit is not None and
                    self.counters['request'] >= self.request_limit):
                logging.debug('Request limit is reached: %s' %
                              self.request_limit)
                if len(freelist) == self.thread_number:
                    # Nothing in flight either — signal idleness
                    yield None
                else:
                    break
            else:
                try:
                    priority, task = self.taskq.get(True, 0.1)
                except Empty:
                    # If All handlers are free and no tasks in queue
                    # yield None signal
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    if not self._preprocess_task(task):
                        continue
                    task.network_try_count += 1
                    if task.task_try_count == 0:
                        task.task_try_count = 1
                    if task.task_try_count > self.task_try_limit:
                        logging.debug('Task tries ended: %s / %s' % (task.name, task.url))
                        self.add_item('too-many-task-tries', task.url)
                        continue
                    if task.network_try_count > self.network_try_limit:
                        logging.debug('Network tries ended: %s / %s' % (task.name, task.url))
                        self.add_item('too-many-network-tries', task.url)
                        continue

                    #import pdb; pdb.set_trace()
                    if task.grab:
                        grab = task.grab
                    else:
                        # Set up curl instance via Grab interface
                        grab = Grab(**self.grab_config)
                        grab.setup(url=task.url)

                    # Only GET requests are served from cache; the raw url
                    # is the cache key here (no hashing in this variant)
                    if self.use_cache and not task.get('disable_cache'):
                        if grab.detect_request_method() == 'GET':
                            url = grab.config['url']
                            cache_item = self.cache.find_one({'_id': url})
                            if cache_item:
                                #if url in self.cache:
                                #cache_item = pickle.loads(self.cache[url])
                                #logging.debug('From cache: %s' % url)
                                cached_request = (grab, grab.clone(),
                                                  task, cache_item)
                                grab.prepare_request()
                                self.inc_count('request-cache')

                                # break from prepre-request cycle
                                # and go to process-response code
                                break

                    self.inc_count('request-network')
                    if self.proxylist_config:
                        args, kwargs = self.proxylist_config
                        grab.setup_proxylist(*args, **kwargs)

                    # Bind grab/task to a free handle and register it
                    curl = freelist.pop()
                    curl.grab = grab
                    curl.grab.curl = curl
                    curl.grab_original = grab.clone()
                    curl.grab.prepare_request()
                    curl.task = task

                    # Add configured curl instance to multi-curl processor
                    m.add_handle(curl)

        # If there were done network requests
        if len(freelist) != self.thread_number:
            while True:
                status, active_objects = m.perform()
                if status != pycurl.E_CALL_MULTI_PERFORM:
                    break

        # Serve the (at most one) cache hit collected above
        if cached_request:
            grab, grab_original, task, cache_item = cached_request
            url = task.url # or grab.config['url']
            grab.fake_response(cache_item['body'])

            def custom_prepare_response(g):
                # Restore response attributes from the cache document
                g.response.head = cache_item['head'].encode('utf-8')
                g.response.body = cache_item['body'].encode('utf-8')
                g.response.code = cache_item['response_code']
                g.response.time = 0
                g.response.url = cache_item['url']
                g.response.parse('utf-8')
                g.response.cookies = g.extract_cookies()

            grab.process_request_result(custom_prepare_response)

            yield {
                'ok': True,
                'grab': grab,
                'grab_original': grab_original,
                'task': task,
                'ecode': None,
                'emsg': None
            }
            self.inc_count('request')

        # Drain finished transfers and recycle their handles
        while True:
            queued_messages, ok_list, fail_list = m.info_read()

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None))
            for curl, ecode, emsg in fail_list:
                results.append((False, curl, ecode, emsg))

            for ok, curl, ecode, emsg in results:
                res = self.process_multicurl_response(
                    ok, curl, ecode, emsg)
                m.remove_handle(curl)
                freelist.append(curl)
                yield res
                self.inc_count('request')

            if not queued_messages:
                break

        m.select(0.5)
class TextExtensionTest(TestCase):
    """Tests for Grab text-search extension: search, rex, assert_* helpers.

    The fake response body is cp1251-encoded Russian text; tests cover both
    the unicode and the byte-string (byte=True) search paths.
    """

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

    def test_search(self):
        self.assertTrue(self.g.search(u'фыва'.encode('cp1251'), byte=True))
        self.assertTrue(self.g.search(u'фыва'))
        self.assertFalse(self.g.search(u'фыва2'))

    def test_search_usage_errors(self):
        # Mixing unicode query with byte=True (and vice versa) must raise
        self.assertRaises(GrabMisuseError,
                          lambda: self.g.search(u'фыва', byte=True))
        self.assertRaises(GrabMisuseError,
                          lambda: self.g.search('фыва'))

    def test_rex(self):
        # Search unicode rex in unicode body - default case
        rex = re.compile(u'(фыва)', re.U)
        self.assertEqual(u'фыва', self.g.rex(rex).group(1))

        # Search non-unicode rex in byte-string body
        rex = re.compile(u'(фыва)'.encode('cp1251'))
        self.assertEqual(u'фыва'.encode('cp1251'),
                         self.g.rex(rex, byte=True).group(1))

        ## Search for non-unicode rex in unicode body should fail
        rex = re.compile('(фыва)')
        self.assertRaises(DataNotFound, lambda: self.g.rex(rex))

        ## Search for unicode rex in byte-string body should fail
        rex = re.compile(u'фыва', re.U)
        self.assertRaises(DataNotFound,
                          lambda: self.g.rex(rex, byte=True))

        ## Search for nonexistent fragment
        rex = re.compile(u'(фыва2)', re.U)
        self.assertRaises(DataNotFound, lambda: self.g.rex(rex))

    def test_assert_substring(self):
        self.g.assert_substring(u'фыва')
        self.g.assert_substring(u'фыва'.encode('cp1251'), byte=True)
        self.assertRaises(DataNotFound,
                          lambda: self.g.assert_substring(u'фыва2'))

    def test_assert_substrings(self):
        # Passes if ANY of the given substrings is present
        self.g.assert_substrings((u'фыва',))
        self.g.assert_substrings((u'фывы нет', u'фыва'))
        self.g.assert_substrings((u'фыва'.encode('cp1251'), 'где ты фыва?'),
                                 byte=True)
        self.assertRaises(DataNotFound,
                          lambda: self.g.assert_substrings(
                              (u'фыва, вернись', u'фыва-а-а-а')))

    def test_assert_rex(self):
        self.g.assert_rex(re.compile(u'фыва'))
        self.g.assert_rex(re.compile(u'фыва'.encode('cp1251')), byte=True)
        self.assertRaises(DataNotFound,
                          lambda: self.g.assert_rex(re.compile(u'фыва2')))

    def test_assert_rex_text(self):
        self.assertEqual(u'ха', self.g.rex_text('<em id="fly-em">([^<]+)'))
class Yandex(object):
    """Scraper for Yandex search result pages (SERP).

    Builds search request URLs, fetches pages via Grab and extracts
    result items (url, title, snippet, mime type) with class-based XPath
    queries against the SERP markup.
    """

    def __init__(self):
        self._g = Grab()
        self._url = 'http://yandex.ru/yandsearch'
        self._sep_params = '?'
        self._sep_prm = '&'
        # CSS class names used by Yandex SERP markup
        self._serp_block_class = 'serp-block'
        self._serp_item_class = 'serp-item'
        # Results shown per SERP page
        self.MAX_ITEMS_TO_PAGE = 10

    def create_request(self, query, region_id, page=0):
        """Build the search URL: lr=region, text=query, p=page.

        NOTE(review): query and region_id are not URL-encoded here —
        presumably the caller passes pre-encoded values; verify.
        """
        req = self._url + self._sep_params + (self._sep_prm.join([
            'lr=' + region_id,
            'text=' + query,
            'p=' + str(page)
        ]))
        return req

    #for-testing
    def fetch_data_local(self):
        # Load a saved SERP page from disk instead of hitting the network
        self._g.fake_response(open('2.html').read())

    def fetch_data(self, req):
        self._g.go(req)

    def get_serp(self, query, deep, region_id):
        """Return SERP data for *query*: positions, total count, re-ask phrase."""
        items = self._get_items(deep, query, region_id)
        result = {'positions': [], 'totalResults': '', 'reask_phrase': ''}
        for en, item in enumerate(items):
            result['positions'].append({
                'positions': en,
                'url': item.url,
                'title': item.title,
                'mimeType': item.mime,
                'snippet': item.snippet
            })
        result['totalResults'] = self._get_total()
        result['reask_phrase'] = self._get_reask()
        return result

    def _get_item(self, item):
        """Extract one result (by 1-based index *item*) as an ItemSerp.

        Snippet and mime are optional in the markup: missing snippet
        becomes '', missing mime icon defaults to 'html'.
        """
        full_path_url_title = self.__query_path_url_title(item)
        full_path_snippet = self.__query_path_snippet(item)
        full_path_mime = self.__query_path_mime(item)
        urltitle = self._g.doc.select(full_path_url_title)
        url = urltitle.attr('href')
        title = urltitle.text()
        try:
            snippet = self._g.doc.select(full_path_snippet).text()
        except DataNotFound:
            snippet = ''
        try:
            mime = self._g.doc.select(full_path_mime).attr('alt')
        except DataNotFound:
            mime = 'html'
        return ItemSerp(url=url, title=title, snippet=snippet, mime=mime)

    def _get_items(self, deep, query, region):
        """Fetch up to *deep* results, paging through the SERP."""
        result = []
        if deep != 0:
            # Number of pages needed to cover `deep` results
            count_page = int(math.ceil(float(deep) / self.MAX_ITEMS_TO_PAGE))
            for page in xrange(count_page):
                req = self.create_request(query, region, page)
                self.fetch_data(req)
                # Last (partial) page takes only the remainder of items;
                # +1 because item indices below start at 1
                if page == count_page - 1 and page != 0:
                    count_items = deep % self.MAX_ITEMS_TO_PAGE + 1
                else:
                    count_items = self.MAX_ITEMS_TO_PAGE + 1
                for item in xrange(1, count_items):
                    result.append(self._get_item(item))
        return result

    def _get_total(self):
        """Return the total-results string reported by the page.

        The [2:] slice strips a leading prefix from the counter text —
        presumably a dash/arrow and space; confirm against live markup.
        """
        query_count = self.__full_xpath([
            self.__factory_xpath('div', 'input__found')
        ])
        return self._g.doc.select(query_count).text()[2:]

    def _get_reask(self):
        """Return the 'did you mean' re-ask phrase, or '' if absent."""
        query_misspell = self.__full_xpath([
            self.__factory_xpath('div', 'misspell'),
            self.__factory_xpath('div', 'message'),
            self.__factory_xpath('div', 'misspell__message')
        ])
        try:
            result = self._g.doc.select(query_misspell).text()
        except DataNotFound:
            result = ''
        return result

    def __query_to_serp_item_wrap(self, item):
        # XPath down to the wrapper div of the item-th result
        query_serp_block = self.__factory_xpath('div', self._serp_block_class)
        query_serp_block_item = self.__factory_xpath('div',
                                                     self._serp_item_class,
                                                     item)
        query_serp_item_wrap = self.__factory_xpath('div', 'serp-item__wrap')
        return '/'.join([
            query_serp_block,
            query_serp_block_item,
            query_serp_item_wrap
        ])

    def __query_path_url_title(self, item):
        # Link element carrying both the result URL (href) and title (text)
        query_serp_item_wrap = self.__query_to_serp_item_wrap(item)
        query_serp_title = '/'.join([
            self.__factory_xpath('h2', 'serp-item__title'),
            self.__factory_xpath('a', 'b-link serp-item__title-link')])
        return self.__full_xpath([
            query_serp_item_wrap,
            query_serp_title
        ])

    def __query_path_snippet(self, item):
        query_serp_item_wrap = self.__query_to_serp_item_wrap(item)
        query_serp_snippet = self.__factory_xpath('div', 'serp-item__text')
        return self.__full_xpath([
            query_serp_item_wrap,
            query_serp_snippet
        ])

    def __query_path_mime(self, item):
        # Mime type lives in the alt attribute of the file-type icon
        query_serp_item_wrap = self.__query_to_serp_item_wrap(item)
        query_serp_item_extra_mine = '/'.join([
            self.__factory_xpath('div', 'serp-item__extra-wrap'),
            self.__factory_xpath('div', 'serp-item__extra'),
            self.__factory_xpath('a', 'b-link'),
            self.__factory_xpath('div', 'serp-item__mime'),
            self.__factory_xpath('img', 'serp-item__mime-icon'),
        ])
        return self.__full_xpath([
            query_serp_item_wrap,
            query_serp_item_extra_mine
        ])

    @classmethod
    def __full_xpath(cls, xpaths):
        # Prefix the first segment with '//' and join the rest with '/'
        first = xpaths[0]
        return '/'.join(['//' + first] + xpaths[1:])

    @classmethod
    def __factory_xpath(cls, tag, clas, index=None):
        # Produce e.g. div[2][contains(@class, "serp-item")]
        index = '[' + str(index) + ']' if index else ''
        return '{0}{1}[contains(@class, "{2}")]'.format(tag, index, clas)
class LXMLExtensionTest(TestCase):
    """Tests for Grab lxml extension: xpath_*, css_* helpers and DOM quirks."""

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        # Parallel plain-lxml tree for comparison with Grab's helpers
        from lxml.html import fromstring
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        # lxml node text_content() method do not put spaces between text
        # content of adjacent XML nodes
        self.assertEqual(
            self.lxml_tree.xpath('//div[@id="bee"]/div')
            [0].text_content().strip(),
            u'пчела')
        self.assertEqual(
            self.lxml_tree.xpath('//div[@id="fly"]')[0].text_content().strip(),
            u'му\nха')

    def test_lxml_xpath(self):
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*'))
        self.assertEqual(set(['em', 'div', 'strong', 'style', 'script']),
                         names)
        names = set(x.tag for x in self.lxml_tree.xpath(
            '//div[@id="bee"]//*[name() != "script" and name() != "style"]'))
        self.assertEqual(set(['em', 'div', 'strong']), names)

    def test_xpath(self):
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual(
            'num-2',
            self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_one('//em[@id="baz"]'))
        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        self.assertEqual(u'пче ла',
                         self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }',
                         self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//code'))
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_text('//*[@id="bee2"]/@id'))

    def test_xpath_number(self):
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li',
                                                  make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li',
                                                    make_int=False))
        self.assertEqual(1002, self.g.xpath_number('//li',
                                                   ignore_spaces=True))
        self.assertEqual(
            '1002',
            self.g.xpath_number('//li', ignore_spaces=True,
                                make_int=False))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_number('//liza'))
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.css_one('em#baz'))
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_text('code'))
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_number('liza'))
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar',
                         self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual(None, g.tree.xpath('//weight')[0].text)
        # But XML DOM builder produces valid result
        #self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)
        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')
        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
<html><body><h1>test</h1></body></html>
"""
        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Parsing degenerate documents must not raise
        SERVER.RESPONSE['get'] = 'oops'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')

        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')
class LXMLExtensionTest(TestCase):
    """Tests for Grab lxml extension: xpath_*, css_* helpers and DOM quirks.

    NOTE(review): this class appears to duplicate an earlier
    LXMLExtensionTest definition — presumably two copies of the same test
    module were concatenated; consider deduplicating.
    """

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        # Parallel plain-lxml tree for comparison with Grab's helpers
        from lxml.html import fromstring
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        # lxml node text_content() method do not put spaces between text
        # content of adjacent XML nodes
        self.assertEqual(self.lxml_tree.xpath('//div[@id="bee"]/div')[0].text_content().strip(), u'пчела')
        self.assertEqual(self.lxml_tree.xpath('//div[@id="fly"]')[0].text_content().strip(), u'му\nха')

    def test_lxml_xpath(self):
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*'))
        self.assertEqual(set(['em', 'div', 'strong', 'style', 'script']), names)
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*[name() != "script" and name() != "style"]'))
        self.assertEqual(set(['em', 'div', 'strong']), names)

    def test_xpath(self):
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual('num-2', self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_one('//em[@id="baz"]'))
        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        self.assertEqual(u'пче ла', self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }', self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//code'))
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//*[@id="bee2"]/@id'))

    def test_xpath_number(self):
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li',
                                                  make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual('1002', self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_number('//liza'))
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        self.assertEqual(['num-1', 'num-2'], [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.css_one('em#baz'))
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_text('code'))
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_number('liza'))
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        self.assertEqual(['num-1', 'num-2'], [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual(None, g.tree.xpath('//weight')[0].text)
        # But XML DOM builder produces valid result
        #self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)
        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')
        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
<html><body><h1>test</h1></body></html>
"""
        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Parsing degenerate documents must not raise
        SERVER.RESPONSE['get'] = 'oops'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')

        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')