예제 #1
0
    def _extract_links(self, selector, response_url, response_encoding,
                       base_url):
        """Build Link objects for every link found under *selector*.

        Each candidate URL is absolutized against ``base_url`` before
        ``process_attr`` and against ``response_url`` afterwards (since
        process_value may return a relative URL).  When ``self.unique``
        is set the result is deduplicated by URL.
        """
        extracted = []
        # hacky way to get the underlying lxml parsed document
        for element, attribute, value in self._iter_links(selector._root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            candidate = self.process_attr(urljoin(base_url, value))
            if candidate is None:
                continue
            if isinstance(candidate, unicode):
                candidate = candidate.encode(response_encoding)
            # to fix relative links after process_value
            candidate = urljoin(response_url, candidate)
            extracted.append(Link(
                candidate,
                _collect_string_content(element) or u'',
                nofollow=(element.get('rel') == 'nofollow'),
            ))
        if not self.unique:
            return extracted
        return unique_list(extracted, key=lambda link: link.url)
예제 #2
0
 def _extract_links(self, selector, response_url, response_encoding,
                    base_url):
     """Extract Link objects from *selector*, skipping unparseable URLs.

     URLs are absolutized against ``base_url`` first; ``process_attr``
     may drop a link (returns None) or return a relative URL, which is
     then re-absolutized against ``response_url``.  Deduplication is
     delegated to ``_deduplicate_if_needed``.
     """
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue  # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         # normalize to the platform's native str type in the page encoding
         url = to_native_str(url, encoding=response_encoding)
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(url,
                     _collect_string_content(el) or u'',
                     nofollow=rel_has_nofollow(el.get('rel')))
         links.append(link)
     return self._deduplicate_if_needed(links)
예제 #3
0
        def test_extract_all_links(self):
            """All links in the fixture page are extracted, in document order."""
            lx = self.extractor_cls()
            # Whitespace in hrefs may or may not be percent-escaped,
            # depending on the extractor implementation under test.
            page4_url = ('http://example.com/page%204.html'
                         if self.escapes_whitespace
                         else 'http://example.com/page 4.html')

            self.assertEqual(
                list(lx.extract_links(self.response)), [
                    Link(url='http://example.com/sample1.html', text=u''),
                    Link(url='http://example.com/sample2.html',
                         text=u'sample 2'),
                    Link(url='http://example.com/sample3.html',
                         text=u'sample 3 text'),
                    Link(url='http://example.com/sample3.html#foo',
                         text='sample 3 repetition with fragment'),
                    Link(url='http://www.google.com/something', text=u''),
                    Link(url='http://example.com/innertag.html',
                         text=u'inner tag'),
                    Link(url=page4_url, text=u'href with whitespaces'),
                ])
예제 #4
0
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Collect Link objects from *selector*.

    Only tags/attributes accepted by ``scan_tag``/``scan_attr`` are
    considered, candidate hrefs are pre-filtered by ``_is_valid_link``,
    then absolutized (twice: before and after ``process_attr``, since
    process_value may return a relative URL).
    """
    found = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        if not (self.scan_tag(el.tag) and self.scan_attr(attr)):
            continue
        # pseudo root.make_links_absolute(base_url)
        # START PATCH: Added check to filter links before making absolute
        if not _is_valid_link(attr_val):
            continue
        # END PATCH
        url = self.process_attr(urljoin(base_url, attr_val))
        if url is None:
            continue
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        found.append(Link(
            url, _collect_string_content(el) or '',
            nofollow=(el.get('rel') == 'nofollow'),
        ))
    if self.unique:
        return unique_list(found, key=lambda link: link.url)
    return found
예제 #5
0
    def extract_links(self, response):
        """Return Link objects for hrefs matched by the ``list_css`` rule,
        plus any next-page links.

        URLs are resolved to full URLs against the response URL;
        entries that fail to resolve are dropped.  Extraction errors are
        logged and whatever was gathered so far is kept (best-effort).
        """
        hxs = Selector(response)
        list_css = self.get_css("list_css")
        if not list_css:
            return []

        urls = []
        try:
            urls.extend(hxs.css(list_css).xpath('@href').extract())
            urls.extend(self.extract_next_links(response))
        except Exception as err:
            # best-effort: log and fall through with what we have
            self.logger.error("%s" % err)

        result = []
        for raw in urls:
            full = URL.s_get_full_url(URL(raw), URL(response.url))
            if full:
                result.append(Link(url=full))

        return result
예제 #6
0
    def extract_links(self, response):
        """Extract links from *response*, honoring ``restrict_xpaths``.

        The response URL itself is always included as the first link;
        per-region results are passed through ``_process_links`` and the
        combined list is deduplicated.
        """
        base_url = get_base_url(response)

        if self.restrict_xpaths:
            regions = []
            for xpath in self.restrict_xpaths:
                regions.extend(response.xpath(xpath))
        else:
            regions = [response.selector]

        all_links = [Link(response.url)]
        for region in regions:
            extracted = self._extract_links(region, response.url,
                                            response.encoding, base_url)
            all_links.extend(self._process_links(extracted))

        return unique_list(all_links)
예제 #7
0
        def test_extract_all_links(self):
            """Every link in the fixture page is extracted, in document order."""
            lx = self.extractor_cls()
            page4_url = "http://example.com/page%204.html"

            self.assertEqual(
                list(lx.extract_links(self.response)),
                [
                    Link(url="http://example.com/sample1.html", text=""),
                    Link(url="http://example.com/sample2.html",
                         text="sample 2"),
                    Link(url="http://example.com/sample3.html",
                         text="sample 3 text"),
                    Link(
                        url="http://example.com/sample3.html#foo",
                        text="sample 3 repetition with fragment",
                    ),
                    Link(url="http://www.google.com/something", text=""),
                    Link(url="http://example.com/innertag.html",
                         text="inner tag"),
                    Link(url=page4_url, text="href with whitespaces"),
                ],
            )
예제 #8
0
    def test_eq_and_hash(self):
        """Links compare and hash by value: URL alone, then URL plus text."""
        same_a = Link("http://www.example.com")
        other_url = Link("http://www.example.com/other")
        same_b = Link("http://www.example.com")

        # Equal URL -> equal links and equal hashes; different URL -> neither.
        self.assertEqual(same_a, same_a)
        self.assertEqual(hash(same_a), hash(same_a))
        self.assertNotEqual(same_a, other_url)
        self.assertNotEqual(hash(same_a), hash(other_url))
        self.assertEqual(same_a, same_b)
        self.assertEqual(hash(same_a), hash(same_b))

        text_a = Link("http://www.example.com", text="test")
        other_text = Link("http://www.example.com", text="test2")
        text_b = Link("http://www.example.com", text="test")

        # Same URL but different text must also break equality/hash.
        self.assertEqual(text_a, text_a)
        self.assertEqual(hash(text_a), hash(text_a))
        self.assertNotEqual(text_a, other_text)
        self.assertNotEqual(hash(text_a), hash(other_text))
        self.assertEqual(text_a, text_b)
        self.assertEqual(hash(text_a), hash(text_b))
예제 #9
0
    def test_extraction_encoding(self):
        """Link URLs/texts must respect the page encoding: declared UTF-8,
        undeclared, and Latin-1 pages each percent-escape differently.

        NOTE(review): legacy Python 2 code -- ``'...'.decode(...)`` does not
        exist on Python 3 text strings.
        """
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

        lx = BaseSgmlLinkExtractor()
        # Encoding declared in the Content-Type header: URLs escaped as UTF-8.
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        # No declared encoding: same UTF-8 expectations must hold.
        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        # Latin-1 page: URLs percent-escaped in Latin-1 (%F1/%E1), not UTF-8.
        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
        ])
 def test_extract_filter_allowed_domains(self):
     """Only links on the allowed domains survive filtering."""
     lx = self.extractor_cls(allow_domains=('google.com', ))
     self.assertEqual(
         list(lx.extract_links(self.response)),
         [Link(url='http://www.google.com/something', text=u'')])
예제 #11
0
File: html.py  Project: tomzhang/portia
 def mklink(url, anchortext=None, nofollow=False):
     # Build a Link from a raw href: strip surrounding whitespace, resolve
     # HTML entities using the page encoding, absolutize against the
     # enclosing scope's `base_href`, and encode the URL back to bytes
     # (legacy/Python 2 style URL handling).
     url = url.strip()
     fullurl = urljoin(base_href, replace_entities(url, encoding=htmlpage.encoding))
     return Link(fullurl.encode(htmlpage.encoding), text=anchortext, nofollow=nofollow)
    def test_extraction(self):
        '''Test the extractor's behaviour among different situations'''

        # Default (unique) extraction over all <img> elements.
        lx = HTMLImageLinkExtractor(locations=('//img', ))
        links_1 = lx.extract_links(self.response)
        self.assertEqual(links_1, [
            Link(url='http://example.com/sample1.jpg', text=u'sample 1'),
            Link(url='http://example.com/sample2.jpg', text=u'sample 2'),
            Link(url='http://example.com/sample4.jpg', text=u'sample 4')
        ])

        # unique=False keeps the repeated sample4 entry.
        lx = HTMLImageLinkExtractor(locations=('//img', ), unique=False)
        links_2 = lx.extract_links(self.response)
        self.assertEqual(links_2, [
            Link(url='http://example.com/sample1.jpg', text=u'sample 1'),
            Link(url='http://example.com/sample2.jpg', text=u'sample 2'),
            Link(url='http://example.com/sample4.jpg', text=u'sample 4'),
            Link(url='http://example.com/sample4.jpg',
                 text=u'sample 4 repetition')
        ])

        # Restricting to a container div yields the same image set.
        lx = HTMLImageLinkExtractor(locations=('//div[@id="wrapper"]', ))
        links_3 = lx.extract_links(self.response)
        self.assertEqual(links_3, [
            Link(url='http://example.com/sample1.jpg', text=u'sample 1'),
            Link(url='http://example.com/sample2.jpg', text=u'sample 2'),
            Link(url='http://example.com/sample4.jpg', text=u'sample 4')
        ])

        # Anchor locations: links come from <a> elements instead of <img>.
        lx = HTMLImageLinkExtractor(locations=('//a', ))
        links_4 = lx.extract_links(self.response)
        self.assertEqual(links_4, [
            Link(url='http://example.com/sample2.jpg', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3')
        ])
예제 #13
0
 def test_follow_whitespace_link(self):
     # Trailing whitespace in an href must be percent-encoded (%20)
     # in the URL that is actually followed.
     self._assert_followed_url(Link('http://example.com/foo '),
                               'http://example.com/foo%20')
    def test_extraction(self):
        '''Test the extractor's behaviour among different situations'''

        # No filters: all unique links in document order.
        lx = SgmlLinkExtractor()
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u'')
        ])

        # allow pattern keeps only matching URLs.
        lx = SgmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text')
        ])

        # unique=False keeps the repeated sample3 entry.
        lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html',
                 text=u'sample 3 repetition')
        ])

        # A fresh extractor with the same allow pattern behaves identically.
        lx = SgmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        # deny takes precedence over allow.
        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
        self.assertEqual([link for link in lx.extract_links(self.response)], [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2')
        ])

        # allow_domains restricts extraction to the given domains.
        lx = SgmlLinkExtractor(allow_domains=('google.com', ))
        self.assertEqual(
            [link for link in lx.extract_links(self.response)],
            [Link(url='http://www.google.com/something', text=u'')])
예제 #15
0
    def test_attrs(self):
        """The ``attrs`` argument controls which attributes yield links."""
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        # With src also scanned (and no extension filtering) the image
        # URL shows up too.
        lx = self.extractor_cls(attrs=("href", "src"),
                                tags=("a", "area", "img"),
                                deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        # attrs=None: nothing to scan, nothing extracted.
        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        # NOTE(review): ("href") is just the string "href", not a 1-tuple
        # (missing comma); the misspelled `ref` attribute below is
        # presumably deliberate, so only the <area href> is extracted.
        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
예제 #16
0
 def test_unicode_url(self):
     # Passing a unicode URL to Link should emit exactly one warning and
     # store the URL UTF-8-encoded as bytes (legacy/Python 2 behaviour).
     with warnings.catch_warnings(record=True) as w:
         link = Link(u"http://www.example.com/\xa3")
         self.assertIsInstance(link.url, bytes)
         self.assertEqual(link.url, b'http://www.example.com/\xc2\xa3')
         assert len(w) == 1, "warning not issued"
예제 #17
0
 def test_extract_filter_allow_and_deny(self):
     """deny patterns take precedence over allow patterns."""
     lx = self.extractor_cls(allow=('sample', ), deny=('3', ))
     self.assertEqual(list(lx.extract_links(self.response)), [
         Link(url='http://example.com/sample1.html', text=''),
         Link(url='http://example.com/sample2.html', text='sample 2'),
     ])
예제 #18
0
 def extract_links(self, response):
     """Find link tokens via ``restrict_re`` and expand each through the
     ``base_url`` template, deduplicating the resulting Links."""
     if not self.base_url:
         # Lazily resolve the template base from the first response seen.
         self.base_url = get_base_url(response)
     matches = re.findall(self.restrict_re, response.text)
     links = [Link(response.urljoin(self.base_url.format(str(m))))
              for m in matches]
     return unique_list(links)
예제 #19
0
File: ecsv.py  Project: zsmj513/portia
 def _extract_links(self, response):
     """Yield a Link for the configured CSV column of each row that is
     wide enough to contain it."""
     reader = csv.reader(StringIO(response.body), **self.fmtparams)
     for row in reader:
         if len(row) <= self.column:
             continue
         yield Link(row[self.column])
예제 #20
0
File: xml.py  Project: stevemarcus/slybot
 def _extract_links(self, response):
     # Select URL strings with the configured XPath and yield one Link
     # per match, encoded back to bytes in the response encoding
     # (legacy/Python 2 style).
     xxs = XmlXPathSelector(response)
     for url in xxs.select(self.xpath).extract():
         yield Link(url.encode(response.encoding))
 def test_restrict_css(self):
     """restrict_css limits extraction to the matching page region."""
     lx = self.extractor_cls(restrict_css=('#subwrapper a', ))
     expected = [Link(url='http://example.com/sample2.html', text=u'sample 2')]
     self.assertEqual(lx.extract_links(self.response), expected)
예제 #22
0
 def mklink(url, anchortext=None, nofollow=False):
     # Build a Link from a raw href: strip whitespace, decode HTML
     # entities with the enclosing scope's `encoding`, absolutize against
     # `base_href`, and encode the result back to bytes (legacy style).
     url = url.strip()
     path = remove_entities(url, encoding=encoding)
     return Link(urljoin(base_href, path).encode(encoding),
                 text=anchortext, nofollow=nofollow)
        def test_attrs(self):
            """The ``attrs`` argument controls which attributes yield links."""
            lx = self.extractor_cls(attrs="href")
            page4_url = 'http://example.com/page%204.html'

            self.assertEqual(lx.extract_links(self.response), [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html',
                     text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html',
                     text=u'inner tag'),
                Link(url=page4_url, text=u'href with whitespaces'),
            ])

            # With src also scanned (and no extension filtering) the
            # image URL shows up too.
            lx = self.extractor_cls(attrs=("href", "src"),
                                    tags=("a", "area", "img"),
                                    deny_extensions=())
            self.assertEqual(lx.extract_links(self.response), [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample2.jpg', text=u''),
                Link(url='http://example.com/sample3.html',
                     text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html',
                     text=u'inner tag'),
                Link(url=page4_url, text=u'href with whitespaces'),
            ])

            # attrs=None: nothing to scan, nothing extracted.
            lx = self.extractor_cls(attrs=None)
            self.assertEqual(lx.extract_links(self.response), [])
예제 #24
0
 def test_restrict_css(self):
     """restrict_css limits extraction to the matching page region."""
     lx = self.extractor_cls(restrict_css=("#subwrapper a", ))
     expected = [Link(url="http://example.com/sample2.html", text="sample 2")]
     self.assertEqual(lx.extract_links(self.response), expected)
예제 #25
0
    def test_xhtml(self):
        xhtml = """
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <title>XHTML document title</title>
</head>
<body>
    <div class='links'>
    <p><a href="/about.html">About us</a></p>
    </div>
    <div>
    <p><a href="/follow.html">Follow this link</a></p>
    </div>
    <div>
    <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
    </div>
    <div>
    <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
    </div>
</body>
</html>
        """

        response = HtmlResponse("http://example.com/index.xhtml", body=xhtml)

        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/about.html',
                 text=u'About us',
                 fragment='',
                 nofollow=False),
            Link(url='http://example.com/follow.html',
                 text=u'Follow this link',
                 fragment='',
                 nofollow=False),
            Link(url='http://example.com/nofollow.html',
                 text=u'Dont follow this one',
                 fragment='',
                 nofollow=True),
            Link(url='http://example.com/nofollow2.html',
                 text=u'Choose to follow or not',
                 fragment='',
                 nofollow=False)
        ])

        response = XmlResponse("http://example.com/index.xhtml", body=xhtml)

        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/about.html',
                 text=u'About us',
                 fragment='',
                 nofollow=False),
            Link(url='http://example.com/follow.html',
                 text=u'Follow this link',
                 fragment='',
                 nofollow=False),
            Link(url='http://example.com/nofollow.html',
                 text=u'Dont follow this one',
                 fragment='',
                 nofollow=True),
            Link(url='http://example.com/nofollow2.html',
                 text=u'Choose to follow or not',
                 fragment='',
                 nofollow=False)
        ])
예제 #26
0
        def test_attrs(self):
            """The ``attrs`` argument controls which attributes yield links."""
            lx = self.extractor_cls(attrs="href")
            page4_url = "http://example.com/page%204.html"

            self.assertEqual(
                lx.extract_links(self.response),
                [
                    Link(url="http://example.com/sample1.html", text=""),
                    Link(url="http://example.com/sample2.html",
                         text="sample 2"),
                    Link(url="http://example.com/sample3.html",
                         text="sample 3 text"),
                    Link(
                        url="http://example.com/sample3.html#foo",
                        text="sample 3 repetition with fragment",
                    ),
                    Link(url="http://www.google.com/something", text=""),
                    Link(url="http://example.com/innertag.html",
                         text="inner tag"),
                    Link(url=page4_url, text="href with whitespaces"),
                ],
            )

            # With src also scanned (and no extension filtering) the
            # image URL shows up too.
            lx = self.extractor_cls(attrs=("href", "src"),
                                    tags=("a", "area", "img"),
                                    deny_extensions=())
            self.assertEqual(
                lx.extract_links(self.response),
                [
                    Link(url="http://example.com/sample1.html", text=""),
                    Link(url="http://example.com/sample2.html",
                         text="sample 2"),
                    Link(url="http://example.com/sample2.jpg", text=""),
                    Link(url="http://example.com/sample3.html",
                         text="sample 3 text"),
                    Link(
                        url="http://example.com/sample3.html#foo",
                        text="sample 3 repetition with fragment",
                    ),
                    Link(url="http://www.google.com/something", text=""),
                    Link(url="http://example.com/innertag.html",
                         text="inner tag"),
                    Link(url=page4_url, text="href with whitespaces"),
                ],
            )

            # attrs=None: nothing to scan, nothing extracted.
            lx = self.extractor_cls(attrs=None)
            self.assertEqual(lx.extract_links(self.response), [])
 def test_restrict_xpaths(self):
     """restrict_xpaths limits extraction to the selected subtree."""
     lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
     self.assertEqual(list(lx.extract_links(self.response)), [
         Link(url='http://example.com/sample1.html', text=u''),
         Link(url='http://example.com/sample2.html', text=u'sample 2')
     ])
예제 #28
0
 def test_restrict_xpaths_with_html_entities(self):
     # HTML entities in hrefs (&hearts;, &euro;) must be decoded using
     # the declared page encoding and then percent-escaped as UTF-8.
     html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
     response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
     links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
     self.assertEqual(links,
                      [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
예제 #29
0
 def _add_link(url_sel, alt_sel=None):
     # Append a Link built from the first URL extracted by `url_sel` to
     # the enclosing scope's `ret`; the alt text falls back to an empty
     # string when no alt selector is given.  Nothing is appended when
     # the URL selector matches nothing.
     url = flatten([url_sel.extract()])
     alt = flatten([alt_sel.extract()]) if alt_sel else (u'', )
     if url:
         ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))
예제 #30
0
 def get_url(self):
     """Return a Link built from this element's first href attribute."""
     href = self.select('@href').extract()[0]
     return Link(href)