Python LxmlLinkExtractor примеры, crawlmi.extractor.LxmlLinkExtractor Python примеры использования

Пример #1

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_encoded_url(self):
     body = '''<html><body><div><a href="?page=2">BinB</a></body></html>'''
     response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
     lx = LxmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB'),
     ])

Пример #2

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_deny_extensions(self):
     html = '''<a href="page.html">asd</a> and <a href="photo.jpg">'''
     response = HtmlResponse('http://example.org/', body=html)
     lx = LxmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])

Пример #3

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_extraction_encoding(self):
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        lx = LxmlLinkExtractor(unique=False)
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1')),
        ])

Пример #4

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_deny_extensions(self):
     html = '''<a href="page.html">asd</a> and <a href="photo.jpg">'''
     response = HtmlResponse('http://example.org/', body=html)
     lx = LxmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])

Пример #5

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_link_text_wrong_encoding(self):
     html = '''<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>'''
     response = HtmlResponse('http://www.example.com', body=html, encoding='utf-8')
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
     ])

Пример #6

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_base_url(self):
        html = '''<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>'''
        response = HtmlResponse('http://example.org/somepage/index.html',
                                body=html)

        lx = LxmlLinkExtractor(unique=False)
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://otherdomain.com/base/item/12.html',
                 text='Item 12')
        ])

        # base url is an absolute path and relative to host
        html = '''<html><head><title>Page title<title><base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
        response = HtmlResponse('https://example.org/somepage/index.html',
                                body=html)
        self.assertEqual(
            lx.extract_links(response),
            [Link(url='https://example.org/item/12.html', text='Item 12')])

        # base url has no scheme
        html = '''<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
        response = HtmlResponse('https://example.org/somepage/index.html',
                                body=html)
        self.assertEqual(lx.extract_links(response), [
            Link(url='https://noschemedomain.com/path/to/item/12.html',
                 text='Item 12')
        ])

Пример #7

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_extraction_encoding_fallback(self):
     body = get_testdata('link_extractor', 'linkextractor_fallback.html')
     response = HtmlResponse(url='http://example.com/fallback', body=body)
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.com/aktu%C3%A1ln%C3%AD_sd%C4%9Blen%C3%AD.htm', text=''),
     ])

Пример #8

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_urls_type(self):
     '''Test that the resulting urls are regular strings and not a unicode objects.
     '''
     lx = LxmlLinkExtractor()
     self.assertTrue(
         all(
             isinstance(link.url, str)
             for link in lx.extract_links(self.response)))

Пример #9

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_encoded_url(self):
     body = '''<html><body><div><a href="?page=2">BinB</a></body></html>'''
     response = HtmlResponse("http://known.fm/AC%2FDC/",
                             body=body,
                             encoding='utf8')
     lx = LxmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB'),
     ])

Пример #10

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_link_nofollow(self):
     html = '''
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     '''
     response = HtmlResponse('http://example.org/page.html', body=html)
     lx = LxmlLinkExtractor()
     self.assertEqual(lx.extract_links(response),
         [Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
          Link(url='http://example.org/about.html', text=u'About us', nofollow=False)])

Пример #11

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_link_text_wrong_encoding(self):
     html = '''<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>'''
     response = HtmlResponse('http://www.example.com',
                             body=html,
                             encoding='utf-8')
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://www.example.com/item/12.html',
              text=u'Wrong: \ufffd'),
     ])

Пример #12

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_invalid(self):
     # parser shouldn't fail or anything
     html = '''<?xml version="1.0" encoding="utf-8"?>
         <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
         <html xmlns="http://www.w3.org/1999/xhtml" lang="cs">
         <body>
         </body>
         </html>'''
     response = HtmlResponse('http://www.example.com', body=html, encoding='utf-8')
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual(lx.extract_links(response), [])

Пример #13

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_invalid(self):
     # parser shouldn't fail or anything
     html = '''<?xml version="1.0" encoding="utf-8"?>
         <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
         <html xmlns="http://www.w3.org/1999/xhtml" lang="cs">
         <body>
         </body>
         </html>'''
     response = HtmlResponse('http://www.example.com',
                             body=html,
                             encoding='utf-8')
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual(lx.extract_links(response), [])

Пример #14

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_extraction_using_single_values(self):
        '''Test the extractor's behaviour among different situations.
        '''
        lx = LxmlLinkExtractor(allow='sample')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = LxmlLinkExtractor(allow='sample', deny='3')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = LxmlLinkExtractor(allow_domains='google.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = LxmlLinkExtractor(deny_domains='example.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Пример #15

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_link_nofollow(self):
     html = '''
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     '''
     response = HtmlResponse('http://example.org/page.html', body=html)
     lx = LxmlLinkExtractor()
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html?action=print',
              text=u'Printer-friendly page',
              nofollow=True),
         Link(url='http://example.org/about.html',
              text=u'About us',
              nofollow=False)
     ])

Пример #16

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_basic(self):
        html = '''<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/" /></p>
        </body></html>'''
        response = HtmlResponse('http://example.org/somepage/index.html', body=html)

        lx = LxmlLinkExtractor(unique=False)
        self.assertListEqual(lx.extract_links(response),
                             [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                              Link(url='http://example.org/about.html', text='About us'),
                              Link(url='http://example.org/othercat.html', text='Other category'),
                              Link(url='http://example.org/', text='>>'),
                              Link(url='http://example.org/', text='')])

Пример #17

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_process_links(self):
        def _process(link):
            if link.url.endswith('othercat.html'):
                return None
            else:
                link.url = 'http://gogo.com/'
                return link

        html = '''<html><head><title>Page title<title>
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/">mimi</a></p>
        <p><a href="/hello">mimino</a></p>
        </body></html>'''
        response = HtmlResponse('http://example.org/', body=html)
        lx = LxmlLinkExtractor()
        self.assertEqual(lx.extract_links(response, process_links=_process),
                         [Link(url='http://gogo.com/', text=u'>>')])

Пример #18

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_process_links(self):
        def _process(link):
            if link.url.endswith('othercat.html'):
                return None
            else:
                link.url = 'http://gogo.com/'
                return link

        html = '''<html><head><title>Page title<title>
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/">mimi</a></p>
        <p><a href="/hello">mimino</a></p>
        </body></html>'''
        response = HtmlResponse('http://example.org/', body=html)
        lx = LxmlLinkExtractor()
        self.assertEqual(lx.extract_links(response, process_links=_process), [
            Link(url='http://gogo.com/', text=u'>>')
        ])

Пример #19

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_relative_paths(self):
     html = '''<html><head></head><body>
     <a href="hello/world/"></a>
     <a href="/hello/world/"></a>
     <a href="/hello/././world"></a>
     <a href="/hello/../world"></a>
     <a href="../hello/world"></a>
     <a href="./hello/world"></a>
     <a href="../../hello/././world"></a>
     </body></html>'''
     response = HtmlResponse('http://exmaple.org/yay/', body=html)
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual([link.url for link in lx.extract_links(response)], [
         'http://exmaple.org/yay/hello/world/',
         'http://exmaple.org/hello/world/',
         'http://exmaple.org/hello/world',
         'http://exmaple.org/world',
         'http://exmaple.org/hello/world',
         'http://exmaple.org/yay/hello/world',
         'http://exmaple.org/hello/world',
     ])

Пример #20

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_relative_paths(self):
     html = '''<html><head></head><body>
     <a href="hello/world/"></a>
     <a href="/hello/world/"></a>
     <a href="/hello/././world"></a>
     <a href="/hello/../world"></a>
     <a href="../hello/world"></a>
     <a href="./hello/world"></a>
     <a href="../../hello/././world"></a>
     </body></html>'''
     response = HtmlResponse('http://exmaple.org/yay/', body=html)
     lx = LxmlLinkExtractor(unique=False)
     self.assertEqual([link.url for link in lx.extract_links(response)], [
         'http://exmaple.org/yay/hello/world/',
         'http://exmaple.org/hello/world/',
         'http://exmaple.org/hello/world',
         'http://exmaple.org/world',
         'http://exmaple.org/hello/world',
         'http://exmaple.org/yay/hello/world',
         'http://exmaple.org/hello/world',
     ])

Пример #21

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_basic(self):
        html = '''<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/" /></p>
        </body></html>'''
        response = HtmlResponse('http://example.org/somepage/index.html',
                                body=html)

        lx = LxmlLinkExtractor(unique=False)
        self.assertListEqual(lx.extract_links(response), [
            Link(url='http://example.org/somepage/item/12.html',
                 text='Item 12'),
            Link(url='http://example.org/about.html', text='About us'),
            Link(url='http://example.org/othercat.html',
                 text='Other category'),
            Link(url='http://example.org/', text='>>'),
            Link(url='http://example.org/', text='')
        ])

Пример #22

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_extraction_encoding(self):
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

        lx = LxmlLinkExtractor(unique=False)
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
        ])

Пример #23

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_base_url(self):
        html = '''<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>'''
        response = HtmlResponse('http://example.org/somepage/index.html', body=html)

        lx = LxmlLinkExtractor(unique=False)
        self.assertEqual(lx.extract_links(response),
                         [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

        # base url is an absolute path and relative to host
        html = '''<html><head><title>Page title<title><base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
        response = HtmlResponse('https://example.org/somepage/index.html', body=html)
        self.assertEqual(lx.extract_links(response),
                         [Link(url='https://example.org/item/12.html', text='Item 12')])

        # base url has no scheme
        html = '''<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
        response = HtmlResponse('https://example.org/somepage/index.html', body=html)
        self.assertEqual(lx.extract_links(response),
                         [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')])

Пример #24

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_url_allowed(self):
        url1 = 'http://lotsofstuff.com/stuff1/index'
        url2 = 'http://evenmorestuff.com/uglystuff/index'

        lx = LxmlLinkExtractor(allow=(r'stuff1', ))
        self.assertEqual(lx.url_allowed(url1), True)
        self.assertEqual(lx.url_allowed(url2), False)

        lx = LxmlLinkExtractor(deny=(r'uglystuff', ))
        self.assertEqual(lx.url_allowed(url1), True)
        self.assertEqual(lx.url_allowed(url2), False)

        lx = LxmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
        self.assertEqual(lx.url_allowed(url1), False)
        self.assertEqual(lx.url_allowed(url2), True)

        lx = LxmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
        self.assertEqual(lx.url_allowed(url1), False)
        self.assertEqual(lx.url_allowed(url2), True)

        lx = LxmlLinkExtractor(allow=('blah1', ),
                               deny=('blah2', ),
                               allow_domains=('blah1.com', ),
                               deny_domains=('blah2.com', ))
        self.assertEqual(lx.url_allowed('http://blah1.com/blah1'), True)
        self.assertEqual(lx.url_allowed('http://blah1.com/blah2'), False)
        self.assertEqual(lx.url_allowed('http://blah2.com/blah1'), False)
        self.assertEqual(lx.url_allowed('http://blah2.com/blah2'), False)

Пример #25

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

 def test_empty_body(self):
     lx = LxmlLinkExtractor()
     response = HtmlResponse('http://www.example.com')
     self.assertEqual(lx.extract_links(response), [])
     response = HtmlResponse('http://www.example.com', body='\n\r\n\n')
     self.assertEqual(lx.extract_links(response), [])

Пример #26

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_empty_body(self):
     lx = LxmlLinkExtractor()
     response = HtmlResponse('http://www.example.com')
     self.assertEqual(lx.extract_links(response), [])
     response = HtmlResponse('http://www.example.com', body='\n\r\n\n')
     self.assertEqual(lx.extract_links(response), [])

Пример #27

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_extraction(self):
        '''Test the extractor's behaviour among different situations.
        '''
        lx = LxmlLinkExtractor()
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ), unique=False)
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html',
                 text=u'sample 3 repetition'),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ), deny=('3', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = LxmlLinkExtractor(allow_domains=('google.com', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Пример #28

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: dahuangfeng123/crawlmi

    def test_extraction_using_single_values(self):
        '''Test the extractor's behaviour among different situations.
        '''
        lx = LxmlLinkExtractor(allow='sample')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = LxmlLinkExtractor(allow='sample', deny='3')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = LxmlLinkExtractor(allow_domains='google.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = LxmlLinkExtractor(deny_domains='example.com')
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Пример #29

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_url_allowed(self):
        url1 = 'http://lotsofstuff.com/stuff1/index'
        url2 = 'http://evenmorestuff.com/uglystuff/index'

        lx = LxmlLinkExtractor(allow=(r'stuff1', ))
        self.assertEqual(lx.url_allowed(url1), True)
        self.assertEqual(lx.url_allowed(url2), False)

        lx = LxmlLinkExtractor(deny=(r'uglystuff', ))
        self.assertEqual(lx.url_allowed(url1), True)
        self.assertEqual(lx.url_allowed(url2), False)

        lx = LxmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
        self.assertEqual(lx.url_allowed(url1), False)
        self.assertEqual(lx.url_allowed(url2), True)

        lx = LxmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
        self.assertEqual(lx.url_allowed(url1), False)
        self.assertEqual(lx.url_allowed(url2), True)

        lx = LxmlLinkExtractor(allow=('blah1',), deny=('blah2',),
                               allow_domains=('blah1.com',),
                               deny_domains=('blah2.com',))
        self.assertEqual(lx.url_allowed('http://blah1.com/blah1'), True)
        self.assertEqual(lx.url_allowed('http://blah1.com/blah2'), False)
        self.assertEqual(lx.url_allowed('http://blah2.com/blah1'), False)
        self.assertEqual(lx.url_allowed('http://blah2.com/blah2'), False)

Пример #30

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

 def test_urls_type(self):
     '''Test that the resulting urls are regular strings and not a unicode objects.
     '''
     lx = LxmlLinkExtractor()
     self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

Пример #31

0

Показать файл

Файл: test_lxml_link_extractor.py Проект: Mimino666/crawlmi

    def test_extraction(self):
        '''Test the extractor's behaviour among different situations.
        '''
        lx = LxmlLinkExtractor()
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ), unique=False)
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = LxmlLinkExtractor(allow=('sample', ), deny=('3', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = LxmlLinkExtractor(allow_domains=('google.com', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://www.google.com/something', text=u''),
        ])

Python LxmlLinkExtractor примеры использования