def test_extraction_encoding(self):
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        lx = LxmlLinkExtractor(unique=False)
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1')),
        ])
Exemplo n.º 2
0
 def test_extraction_encoding_fallback(self):
     body = get_testdata('link_extractor', 'linkextractor_fallback.html')
     response = HtmlResponse(url='http://example.com/fallback', body=body)
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.com/aktu%C3%A1ln%C3%AD_sd%C4%9Blen%C3%AD.htm', text=''),
     ])
Exemplo n.º 3
0
    def test_extraction_encoding(self):
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

        lx = LxmlLinkExtractor(unique=False)
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
        ])
Exemplo n.º 4
0
    def test_css(self):
        body = get_testdata('pages', 'ip_page.html')
        response = HtmlResponse(url='http://myip.com/list', body=body)
        hxs = response.selector

        valid_ts = S('_',
                     css='div#main',
                     quant='1',
                     children=[
                         S('all_ip', css='span.ip', quant='7'),
                         S('_',
                           css='ul#ip_list',
                           quant='1',
                           children=[S('list_ip', css='span.ip', quant='6')]),
                     ])
        parsed = valid_ts.parse(hxs)
        self.assertRaises(TypeError, S, '_')
        self.assertRaises(TypeError, S, '_', 'div[@id="main]', css='div#main')
Exemplo n.º 5
0
 def setUp(self):
     body = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(url='http://example.com/index', body=body)
Exemplo n.º 6
0
    def test_xpath(self):
        body = get_testdata('pages', 'ip_page.html')
        response = HtmlResponse(url='http://myip.com/list', body=body)
        hxs = response.selector

        # test valid parsing
        valid_ts = S(
            '_',
            '//div[@id="main"]',
            quant='1',
            children=[
                S('title', 'h1', quant='1', value='text()'),
                S('full_title_script',
                  'h1|div[@id="subtitle"]/h2',
                  quant='2',
                  value='descendant-or-self::text()'),
                # although the following statement gives the same elements, it gives them in different order
                # S('full_title_script', '(h1|div[@id="subtitle"]/h2)/descendant-or-self::*', quant='+', value='text()'),
                S('full_title_no_script',
                  'h1|div[@id="subtitle"]/h2',
                  quant='2',
                  value='descendant-or-self::*[name()!="script"]/text()'),
                S('full_title_script_bad',
                  '(h1|div[@id="subtitle"]/h2)//*',
                  quant='+',
                  value='text()'),
                S('_list',
                  'ul[@id="ip_list"]',
                  quant='1',
                  children=[
                      S('_ips',
                        'li',
                        quant='6',
                        group='ips',
                        children=[
                            S('ip',
                              'span[@class="ip"]',
                              quant='1',
                              value='text()'),
                            S('port',
                              'span[@class="port"]',
                              quant='1',
                              value='text()'),
                            S('ip_port',
                              'self::*',
                              value='descendant-or-self::text()'),
                        ])
                  ]),
                S('url',
                  'descendant-or-self::a',
                  quant='1',
                  value='@href',
                  callback=S.absolute_url),
                S('empty', 'div[@id="empty"]', quant='1', value='text()'),
                S('footer',
                  'following-sibling::div[@id="footer"]',
                  quant='1',
                  children=[
                      S('footer_links',
                        'a',
                        quant='+',
                        value='@href',
                        callback=S.absolute_url)
                  ]),
                S('nonexistent', 'div/div/div', quant='?', value='text()')
            ])

        # validation without context, when context is expected
        self.assertRaises(SValidationError, valid_ts.parse, hxs)

        parsed = valid_ts.parse(response)

        self.assertItemsEqual(parsed, [
            'title', 'full_title_script', 'full_title_no_script',
            'full_title_script_bad', 'ips', 'url', 'empty', 'footer',
            'footer_links'
        ])  # nonexistent is missing!

        # title
        self.assertIsInstance(parsed['title'], list)
        self.assertListEqual(parsed['title'],
                             [u'Here is the list of some ', u' addresses '
                              ])  # text inside strong is not pased
        # full_title
        self.assertListEqual(parsed['full_title_script'], [
            u'Here is the list of some ', u'ip', u' addresses ', u'!!!',
            u'Just ', u'some', u' ', u'this is bad', u' other text.'
        ])  # order of text nodes is perserved
        self.assertListEqual(parsed['full_title_no_script'], [
            u'Here is the list of some ', u'ip', u' addresses ', u'!!!',
            u'Just ', u'some', u' ', u' other text.'
        ])  # same result as before, excluding script content
        self.assertListEqual(parsed['full_title_script_bad'],
                             [u'ip', u'!!!', u'some', u'this is bad'
                              ])  # this only took the inner nodes
        # ips
        self.assertIsInstance(parsed['ips'], list)
        self.assertEqual(len(parsed['ips']), 6)
        first = parsed['ips'][0]
        self.assertIsInstance(first, defaultdict)
        self.assertItemsEqual(first, ['ip', 'port', 'ip_port'])
        self.assertListEqual(first['ip'], [u'123.44.1.9'])
        self.assertIsInstance(first['ip'][0],
                              unicode)  # parsed objects are always unicode
        self.assertListEqual(first['port'], [u'80'])
        self.assertListEqual(first['ip_port'], [u'123.44.1.9', u':', u'80'])
        # url
        self.assertListEqual(parsed['url'], [u'http://myip.com/url1'])
        self.assertIsInstance(
            parsed['url'][0],
            unicode)  # even urls are unicode after being processed
        # empty
        self.assertListEqual(
            parsed['empty'], []
        )  # even though we matched 1 tag empty, the was no text and the returned list is empty
        # footer
        self.assertIsInstance(parsed['footer'][0], HtmlXPathSelector)
        self.assertListEqual(parsed['footer_links'],
                             [u'http://myip.com/url2', u'http://google.com/'])
 def setUp(self):
     body = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(url='http://example.com/index', body=body)
Exemplo n.º 8
0
    def test_ip_page(self):
        '''
        Test selector on ip_page.html.
        '''

        body = get_testdata('pages', 'ip_page.html')
        response = HtmlResponse(url='http://myip.com/list', body=body)
        hxs = response.selector

        # test valid parsing
        valid_ts = S('_', '//div[@id="main"]', quant='1', children=[
            S('title', 'h1', quant='1', value='text()'),
            S('full_title_script', 'h1|div[@id="subtitle"]/h2', quant='2', value='descendant-or-self::text()'),
            # although the following statement gives the same elements, it gives them in different order
            # S('full_title_script', '(h1|div[@id="subtitle"]/h2)/descendant-or-self::*', quant='+', value='text()'),
            S('full_title_no_script', 'h1|div[@id="subtitle"]/h2', quant='2', value='descendant-or-self::*[name()!="script"]/text()'),
            S('full_title_script_bad', '(h1|div[@id="subtitle"]/h2)//*', quant='+', value='text()'),

            S('_list', 'ul[@id="ip_list"]', quant='1', children=[
                S('_ips', 'li', quant='6', group='ips', children=[
                    S('ip', 'span[@class="ip"]', quant='1', value='text()'),
                    S('port', 'span[@class="port"]', quant='1', value='text()'),
                    S('ip_port', 'self::*', value='descendant-or-self::text()'),
                ])
            ]),

            S('url', 'descendant-or-self::a', quant='1', value='@href', callback=absolute_url),
            S('empty', 'div[@id="empty"]', quant='1', value='text()'),
            S('footer', 'following-sibling::div[@id="footer"]', quant='1', children=[
                S('footer_links', 'a', quant='+', value='@href', callback=absolute_url)
            ]),
            S('nonexistent', 'div/div/div', quant='?', value='text()')
        ])
        context = {'response': response}

        self.assertTrue(valid_ts.xpath_exists(hxs))

        # validation without context, when context is expected
        self.assertRaises(SValidationError, valid_ts.parse, hxs)

        parsed = valid_ts.parse(hxs, context)

        self.assertItemsEqual(parsed, ['title', 'full_title_script', 'full_title_no_script', 'full_title_script_bad', 'ips',
            'url', 'empty', 'footer', 'footer_links'])  # nonexistent is missing!

        # title
        self.assertIsInstance(parsed['title'], list)
        self.assertListEqual(parsed['title'], [u'Here is the list of some ', u' addresses '])  # text inside strong is not pased
        # full_title
        self.assertListEqual(parsed['full_title_script'], [u'Here is the list of some ', u'ip',
            u' addresses ', u'!!!', u'Just ', u'some', u' ', u'this is bad', u' other text.'])  # order of text nodes is perserved
        self.assertListEqual(parsed['full_title_no_script'], [u'Here is the list of some ', u'ip',
            u' addresses ', u'!!!', u'Just ', u'some', u' ', u' other text.'])  # same result as before, excluding script content
        self.assertListEqual(parsed['full_title_script_bad'], [u'ip', u'!!!', u'some', u'this is bad'])  # this only took the inner nodes
        # ips
        self.assertIsInstance(parsed['ips'], list)
        self.assertEqual(len(parsed['ips']), 6)
        first = parsed['ips'][0]
        self.assertIsInstance(first, defaultdict)
        self.assertItemsEqual(first, ['ip', 'port', 'ip_port'])
        self.assertListEqual(first['ip'], [u'123.44.1.9'])
        self.assertIsInstance(first['ip'][0], unicode)  # parsed objects are always unicode
        self.assertListEqual(first['port'], [u'80'])
        self.assertListEqual(first['ip_port'], [u'123.44.1.9', u':', u'80'])
        # url
        self.assertListEqual(parsed['url'], [u'http://myip.com/url1'])
        self.assertIsInstance(parsed['url'][0], unicode)  # even urls are unicode after being processed
        # empty
        self.assertListEqual(parsed['empty'], [])  # even though we matched 1 tag empty, the was no text and the returned list is empty
        # footer
        self.assertIsInstance(parsed['footer'][0], HtmlXPathSelector)
        self.assertListEqual(parsed['footer_links'], [u'http://myip.com/url2', u'http://google.com/'])
Exemplo n.º 9
0
 def setUp(self):
     body = get_testdata('pages', 'ip_page.html')
     response = HtmlResponse(url='http://myip.com/list', body=body)
     hxs = response.selector
     self.parsed = basic_ts.parse(hxs)
Exemplo n.º 10
0
 def setUp(self):
     body = get_testdata('pages', 'ip_page.html')
     response = HtmlResponse(url='http://myip.com/list', body=body)
     hxs = response.selector
     self.parsed = basic_ts.parse(hxs)