class LXMLExtensionTest(TestCase):
    """Checks of Grab's lxml-based XPath/CSS helpers against a fake response."""

    def setUp(self):
        """Reset the test server, build a Grab with a canned body and a raw lxml tree."""
        from lxml.html import fromstring

        SERVER.reset()

        # Grab instance that is fed a prepared HTML body via fake_response().
        grab = self.g = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(HTML, charset='cp1251')

        # Plain lxml tree built from the same body, used by the tests that
        # talk to lxml directly instead of going through Grab.
        self.lxml_tree = fromstring(grab.response.body)

    def test_lxml_text_content_fail(self):
        """lxml's text_content() does not put spaces between adjacent nodes' text."""
        bee_inner = self.lxml_tree.xpath('//div[@id="bee"]/div')[0]
        self.assertEqual(bee_inner.text_content().strip(), u'пчела')

        fly = self.lxml_tree.xpath('//div[@id="fly"]')[0]
        self.assertEqual(fly.text_content().strip(), u'му\nха')

    def test_lxml_xpath(self):
        """Descendant tag names under #bee, with and without script/style nodes."""
        every_tag = {
            node.tag
            for node in self.lxml_tree.xpath('//div[@id="bee"]//*')
        }
        self.assertEqual({'em', 'div', 'strong', 'style', 'script'}, every_tag)

        filtered_query = (
            '//div[@id="bee"]//*[name() != "script" and name() != "style"]'
        )
        filtered_tags = {
            node.tag for node in self.lxml_tree.xpath(filtered_query)
        }
        self.assertEqual({'em', 'div', 'strong'}, filtered_tags)

    def test_xpath(self):
        """xpath_one(): found node, DataNotFound on a miss, and `default` handling."""
        first_em = self.g.xpath_one('//em')
        self.assertEqual('bee-em', first_em.get('id'))

        second_item = self.g.xpath_one(u'//*[text() = "item #2"]')
        self.assertEqual('num-2', second_item.get('id'))

        with self.assertRaises(DataNotFound):
            self.g.xpath_one('//em[@id="baz"]')

        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        """xpath_text() in smart and non-smart mode, plus missing-data errors."""
        bee_query = '//*[@id="bee"]'
        self.assertEqual(u'пче ла', self.g.xpath_text(bee_query, smart=True))
        self.assertEqual(
            u'пчела mozilla = 777; body { color: green; }',
            self.g.xpath_text(bee_query, smart=False),
        )
        self.assertEqual(
            u'пче ла му ха item #100 2 item #2',
            self.g.xpath_text('/html/body', smart=True),
        )

        with self.assertRaises(DataNotFound):
            self.g.xpath_text('//code')

        # Attribute selection: an existing @id is returned, a missing one raises.
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        with self.assertRaises(DataNotFound):
            self.g.xpath_text('//*[@id="bee2"]/@id')

    def test_xpath_number(self):
        """xpath_number() with the make_int / ignore_spaces / default options."""
        number = self.g.xpath_number

        self.assertEqual(100, number('//li'))
        self.assertEqual(100, number('//li', make_int=True))
        self.assertEqual('100', number('//li', make_int=False))
        self.assertEqual(1002, number('//li', ignore_spaces=True))
        self.assertEqual(
            '1002', number('//li', ignore_spaces=True, make_int=False))

        with self.assertRaises(DataNotFound):
            number('//liza')

        self.assertEqual('foo', number('//zzz', default='foo'))

    def test_xpath_list(self):
        """xpath_list() yields every matching <li> in document order."""
        found_ids = [item.get('id') for item in self.g.xpath_list('//li')]
        self.assertEqual(['num-1', 'num-2'], found_ids)

    def test_css(self):
        """css_one(): found node, DataNotFound on a miss, and `default` handling."""
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))

        with self.assertRaises(DataNotFound):
            self.g.css_one('em#baz')

        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        """css_text() in smart mode, on a miss, and with a default."""
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(
            u'пче ла му ха item #100 2 item #2',
            self.g.css_text('html body', smart=True),
        )

        with self.assertRaises(DataNotFound):
            self.g.css_text('code')

        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        """css_number() with the make_int / ignore_spaces / default options."""
        number = self.g.css_number

        self.assertEqual(100, number('li'))
        self.assertEqual('100', number('li', make_int=False))
        self.assertEqual(1002, number('li', ignore_spaces=True))

        with self.assertRaises(DataNotFound):
            number('liza')

        self.assertEqual('foo', number('zzz', default='foo'))

    def test_css_list(self):
        """css_list() yields every matching <li> in document order."""
        found_ids = [item.get('id') for item in self.g.css_list('li')]
        self.assertEqual(['num-1', 'num-2'], found_ids)

    def test_strip_tags(self):
        """strip_tags() on closed, unclosed and whitespace-only markup."""
        strip = self.g.strip_tags

        self.assertEqual('foo', strip('<b>foo</b>'))
        self.assertEqual('foo bar', strip('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', strip('<b>foo</b><i>bar'))
        self.assertEqual('foo bar', strip('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', strip('<b> <div>'))

    def test_css_exists(self):
        """css_exists() is true for a present selector and false for an absent one."""
        present = self.g.css_exists('li#num-1')
        absent = self.g.css_exists('li#num-3')
        self.assertTrue(present)
        self.assertFalse(absent)

    def test_xpath_exists(self):
        """xpath_exists() is true for a present node and false for an absent one."""
        present = self.g.xpath_exists('//li[@id="num-1"]')
        absent = self.g.xpath_exists('//li[@id="num-3"]')
        self.assertTrue(present)
        self.assertFalse(absent)

    def test_cdata_issue(self):
        """CDATA text is lost by the default HTML builder but kept by the XML one."""
        html_mode = Grab(transport=GRAB_TRANSPORT)
        html_mode.fake_response(XML)

        # By default the HTML DOM builder is used; it handles CDATA incorrectly.
        self.assertEqual(None, html_mode.xpath_one('//weight').text)
        self.assertEqual(None, html_mode.tree.xpath('//weight')[0].text)

        # The XML DOM builder produces the valid result.
        self.assertEqual('30', html_mode.xml_tree.xpath('//weight')[0].text)

        # The `content_type` option switches the default DOM builder.
        xml_mode = Grab(transport=GRAB_TRANSPORT)
        xml_mode.fake_response(XML)
        xml_mode.setup(content_type='xml')

        self.assertEqual('30', xml_mode.xpath_one('//weight').text)
        self.assertEqual('30', xml_mode.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = (
            '<?xml version="1.0" encoding="UTF-8"?> '
            '<html><body><h1>test</h1></body></html> '
        )
        grab = Grab()
        grab.go(SERVER.BASE_URL)
        self.assertEqual('test', grab.xpath_text('//h1'))

    def test_empty_document(self):
        """Querying documents with no usable markup must not raise."""
        # Body that is not markup at all.
        SERVER.RESPONSE['get'] = 'oops'
        plain = Grab()
        plain.go(SERVER.BASE_URL)
        plain.xpath_exists('//anytag')

        # Body that holds only an empty <frameset>.
        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        frameset = Grab()
        frameset.go(SERVER.BASE_URL)
        frameset.xpath_exists('//anytag')
# Demo script: fetch the habrahabr.ru front page with Grab and print a few
# values extracted through the XPath/CSS helpers.
import logging
from urllib.parse import urlsplit

from grab import Grab

logging.basicConfig(level=logging.DEBUG)

g = Grab()
g.go('http://habrahabr.ru')

# The result of this lookup is discarded; the call is kept as in the original.
# NOTE(review): presumably a demonstration of g.xpath() -- confirm it is needed.
g.xpath('//h2/a[@class="topic"]').get('href')

print(g.xpath_text('//h2/a[@class="topic"]'))
print(g.css_text('h2 a.topic'))
print('Comments:', g.css_number('.comments .all'))

# Hosts of the plain-http links that point outside habrahabr.ru.
# `href` is read once per anchor and anchors without an href attribute are
# skipped: element.get() returns None for them, which previously made the
# `in` / `.startswith()` checks raise instead of ignoring the anchor.
hrefs = (anchor.get('href') for anchor in g.css_list('.hentry a'))
external_hosts = (
    urlsplit(href).netloc
    for href in hrefs
    if href and 'habrahabr.ru' not in href and href.startswith('http:')
)
print(', '.join(external_hosts))
class LXMLExtensionTest(TestCase):
    """Tests for Grab's XPath/CSS selection helpers, run on canned documents."""

    def setUp(self):
        """Prepare `self.g` (Grab with a fake HTML response) and `self.lxml_tree`."""
        SERVER.reset()

        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        import lxml.html
        self.lxml_tree = lxml.html.fromstring(self.g.response.body)

    def _assert_missing(self, finder, query):
        """Assert that calling ``finder(query)`` raises ``DataNotFound``."""
        self.assertRaises(DataNotFound, finder, query)

    def _stripped_text(self, query):
        """Return the stripped text_content() of the first lxml node matching `query`."""
        return self.lxml_tree.xpath(query)[0].text_content().strip()

    def _tag_set(self, query):
        """Return the set of tag names of all lxml nodes matching `query`."""
        return set(map(lambda node: node.tag, self.lxml_tree.xpath(query)))

    def test_lxml_text_content_fail(self):
        # The lxml text_content() method does not put spaces between the text
        # content of adjacent XML nodes.
        self.assertEqual(self._stripped_text('//div[@id="bee"]/div'), u'пчела')
        self.assertEqual(self._stripped_text('//div[@id="fly"]'), u'му\nха')

    def test_lxml_xpath(self):
        # All descendants of #bee, then the same minus <script> and <style>.
        self.assertEqual(
            set(('em', 'div', 'strong', 'style', 'script')),
            self._tag_set('//div[@id="bee"]//*'),
        )
        self.assertEqual(
            set(('em', 'div', 'strong')),
            self._tag_set(
                '//div[@id="bee"]//*'
                '[name() != "script" and name() != "style"]'
            ),
        )

    def test_xpath(self):
        # Successful lookups: (expected id, xpath expression).
        for expected_id, query in (
            ('bee-em', '//em'),
            ('num-2', u'//*[text() = "item #2"]'),
        ):
            self.assertEqual(expected_id, self.g.xpath_one(query).get('id'))

        self._assert_missing(self.g.xpath_one, '//em[@id="baz"]')

        # With `default` given, a miss returns the default instead of raising.
        for fallback in (None, 'foo'):
            self.assertEqual(
                fallback, self.g.xpath_one('//zzz', default=fallback))

    def test_xpath_text(self):
        # (expected text, xpath expression, smart flag)
        for expected, query, smart in (
            (u'пче ла', '//*[@id="bee"]', True),
            (u'пчела mozilla = 777; body { color: green; }',
             '//*[@id="bee"]', False),
            (u'пче ла му ха item #100 2 item #2', '/html/body', True),
        ):
            self.assertEqual(expected, self.g.xpath_text(query, smart=smart))

        self._assert_missing(self.g.xpath_text, '//code')
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self._assert_missing(self.g.xpath_text, '//*[@id="bee2"]/@id')

    def test_xpath_number(self):
        # (expected value, keyword options passed to xpath_number)
        for expected, options in (
            (100, {}),
            (100, {'make_int': True}),
            ('100', {'make_int': False}),
            (1002, {'ignore_spaces': True}),
            ('1002', {'ignore_spaces': True, 'make_int': False}),
        ):
            self.assertEqual(
                expected, self.g.xpath_number('//li', **options))

        self._assert_missing(self.g.xpath_number, '//liza')
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        ids = []
        for node in self.g.xpath_list('//li'):
            ids.append(node.get('id'))
        self.assertEqual(['num-1', 'num-2'], ids)

    def test_css(self):
        # Successful lookups: (expected id, css selector).
        for expected_id, selector in (('bee-em', 'em'), ('num-2', '#num-2')):
            self.assertEqual(
                expected_id, self.g.css_one(selector).get('id'))

        self._assert_missing(self.g.css_one, 'em#baz')
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        # (expected text, css selector), both in smart mode
        for expected, selector in (
            (u'пче ла', '#bee'),
            (u'пче ла му ха item #100 2 item #2', 'html body'),
        ):
            self.assertEqual(
                expected, self.g.css_text(selector, smart=True))

        self._assert_missing(self.g.css_text, 'code')
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        # (expected value, keyword options passed to css_number)
        for expected, options in (
            (100, {}),
            ('100', {'make_int': False}),
            (1002, {'ignore_spaces': True}),
        ):
            self.assertEqual(expected, self.g.css_number('li', **options))

        self._assert_missing(self.g.css_number, 'liza')
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        ids = []
        for node in self.g.css_list('li'):
            ids.append(node.get('id'))
        self.assertEqual(['num-1', 'num-2'], ids)

    def test_strip_tags(self):
        # (expected text, markup, keyword options passed to strip_tags)
        for expected, markup, options in (
            ('foo', '<b>foo</b>', {}),
            ('foo bar', '<b>foo</b> <i>bar', {}),
            ('foobar', '<b>foo</b><i>bar', {}),
            ('foo bar', '<b>foo</b><i>bar', {'smart': True}),
            ('', '<b> <div>', {}),
        ):
            self.assertEqual(
                expected, self.g.strip_tags(markup, **options))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        weight = '//weight'

        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one(weight).text)
        self.assertEqual(None, g.tree.xpath(weight)[0].text)

        # But XML DOM builder produces valid result
        self.assertEqual('30', g.xml_tree.xpath(weight)[0].text)

        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')

        self.assertEqual('30', g.xpath_one(weight).text)
        self.assertEqual('30', g.tree.xpath(weight)[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        declaration = '<?xml version="1.0" encoding="UTF-8"?>'
        document = '<html><body><h1>test</h1></body></html>'
        SERVER.RESPONSE['get'] = declaration + ' ' + document + ' '

        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Neither a non-markup body nor a bare <frameset> may make the query
        # raise; the return value of xpath_exists() is not checked.
        for body in ('oops', '<frameset></frameset>'):
            SERVER.RESPONSE['get'] = body
            g = Grab()
            g.go(SERVER.BASE_URL)
            g.xpath_exists('//anytag')