Example #1
    def scrape(self, request, response):
        if not self.is_html(request, response):
            return

        content_file = response.body.content_file
        encoding = get_heading_encoding(response)

        tree = self.parse(content_file, encoding, request.url_info.url)
        root = tree.getroot()

        if root is None:
            return

        linked_urls = set()
        inline_urls = set()

        link_infos = self.iter_links(root)

        if 'Refresh' in response.fields:
            link = parse_refresh(response.fields['Refresh'])

            if link:
                link_info = LinkInfo(None, '_refresh', None, to_str(link),
                                     False, True, None, 'refresh')
                link_infos = itertools.chain(link_infos, [link_info])

        for scraped_link in link_infos:
            if self._only_relative:
                if scraped_link.base_link or '://' in scraped_link.link:
                    continue

            if not self._is_accepted(scraped_link.tag):
                continue

            base_url = root.base_url

            if scraped_link.base_link:
                base_url = wpull.url.urljoin(base_url, scraped_link.base_link)

            url = wpull.url.urljoin(base_url,
                                    scraped_link.link,
                                    allow_fragments=False)

            # Browsers seem to tolerate URLs with newlines; strip them out
            url = url.replace('\n', '').replace('\r', '')

            if scraped_link.inline:
                inline_urls.add(url)
            if scraped_link.linked:
                linked_urls.add(url)

        if self._robots and self._robots_cannot_follow(root):
            linked_urls.clear()

        return {
            'inline_urls': inline_urls,
            'linked_urls': linked_urls,
            'base_url': to_str(root.base_url),
            'encoding': to_str(root.getroottree().docinfo.encoding),
        }
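
The eight positional arguments passed to LinkInfo throughout these examples suggest a named tuple along these lines (a sketch; the field names are inferred from the call sites and from attribute accesses such as scraped_link.base_link, not copied from the real source):

    import collections

    # Field order inferred from the calls above: element, tag, attrib,
    # link, inline, linked, base_link, value_type.
    LinkInfo = collections.namedtuple(
        'LinkInfo',
        ['element', 'tag', 'attrib', 'link',
         'inline', 'linked', 'base_link', 'value_type'])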
Example #2
    def iter_links_element(cls, element):
        '''Iterate an HTML element.'''
        # reference: lxml.html.HtmlMixin.iterlinks()
        # NOTE: to_str is needed because on Python 2, only byte strings
        # are returned from lxml
        attrib = element.attrib
        tag = element.tag

        if tag == 'link':
            iterable = cls.iter_links_link_element(element)
        elif tag == 'meta':
            iterable = cls.iter_links_meta_element(element)
        elif tag in ('object', 'applet'):
            iterable = cls.iter_links_object_element(element)
        elif tag == 'param':
            iterable = cls.iter_links_param_element(element)
        elif tag == 'style':
            iterable = cls.iter_links_style_element(element)
        else:
            iterable = cls.iter_links_plain_element(element)

        for link_info in iterable:
            yield link_info

        if 'style' in attrib:
            for link in CSSScraper.scrape_urls(attrib['style']):
                yield LinkInfo(element, element.tag, 'style', to_str(link),
                               True, False, None, 'css')
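
iter_links_element dispatches on the tag name; a minimal driver that applies it to every element in the tree might look like this (a sketch assuming lxml's Element.iter(); the project's actual iter_links may differ):

    @classmethod
    def iter_links(cls, root):
        '''Walk the document tree and yield every LinkInfo found.'''
        for element in root.iter():
            for link_info in cls.iter_links_element(element):
                yield link_info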
Example #3
    def iter_links_element(cls, element):
        '''Iterate an HTML element.'''
        # reference: lxml.html.HtmlMixin.iterlinks()
        # NOTE: to_str is needed because on Python 2, only byte strings
        # are returned from lxml
        attrib = element.attrib
        tag = element.tag

        if tag == 'link':
            iterable = cls.iter_links_link_element(element)
        elif tag == 'meta':
            iterable = cls.iter_links_meta_element(element)
        elif tag in ('object', 'applet'):
            iterable = cls.iter_links_object_element(element)
        elif tag == 'param':
            iterable = cls.iter_links_param_element(element)
        elif tag == 'style':
            iterable = cls.iter_links_style_element(element)
        else:
            iterable = cls.iter_links_plain_element(element)

        for link_info in iterable:
            yield link_info

        if 'style' in attrib:
            for link in CSSScraper.scrape_urls(attrib['style']):
                yield LinkInfo(
                    element, element.tag, 'style',
                    to_str(link),
                    True, False,
                    None,
                    'css'
                )
Example #4
    def iter_links_plain_element(cls, element):
        '''Iterate any element for links using generic rules.'''
        for attrib_name, link in cls.iter_links_by_attrib(element):
            inline = cls.is_link_inline(element.tag, attrib_name)
            linked = cls.is_html_link(element.tag, attrib_name)
            yield LinkInfo(element, element.tag, attrib_name, to_str(link),
                           inline, linked, None, 'plain')
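
The helper iter_links_by_attrib is not shown in these examples; a plausible sketch, assuming a hypothetical LINK_ATTRIBUTES table of URL-bearing attribute names, would be:

    # LINK_ATTRIBUTES is hypothetical; the real scraper presumably keeps
    # its own table of which attributes carry URLs.
    LINK_ATTRIBUTES = frozenset(['href', 'src', 'action', 'poster',
                                 'background'])

    @classmethod
    def iter_links_by_attrib(cls, element):
        '''Yield (attribute name, value) pairs whose values are URLs.'''
        for attrib_name, value in element.attrib.items():
            if attrib_name in LINK_ATTRIBUTES:
                yield attrib_name, value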
Example #5
    def iter_links_param_element(cls, element):
        '''Iterate a ``param`` element.'''
        valuetype = element.get('valuetype', '')

        if valuetype.lower() == 'ref' and 'value' in element.attrib:
            yield LinkInfo(element, element.tag, 'value',
                           to_str(element.get('value')), True, False, None,
                           'plain')
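
For example, on a ``param`` element built with lxml (a usage sketch, assuming lxml.html parses the bare fragment):

    import lxml.html

    element = lxml.html.fragment_fromstring(
        '<param valuetype="ref" value="movie.swf">')
    # iter_links_param_element(element) yields one LinkInfo with
    # attrib='value', link='movie.swf', inline=True, linked=False.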
Example #6
    def iter_links_style_element(cls, element):
        '''Iterate a ``style`` element.'''
        if element.text:
            link_iter = itertools.chain(
                CSSScraper.scrape_imports(element.text),
                CSSScraper.scrape_urls(element.text))
            for link in link_iter:
                yield LinkInfo(element, element.tag, None, to_str(link), True,
                               False, None, 'css')
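
CSSScraper.scrape_imports and CSSScraper.scrape_urls are defined elsewhere; a rough stand-in for what they might match (the regular expressions below are assumptions, not the real implementation):

    import re

    def scrape_urls(text):
        '''Yield URLs found inside url(...) tokens.'''
        for match in re.finditer(
                r'''url\(\s*['"]?([^'")]+)['"]?\s*\)''', text):
            yield match.group(1)

    def scrape_imports(text):
        '''Yield URLs from @import rules that omit url(...).'''
        for match in re.finditer(r'''@import\s+['"]([^'"]+)['"]''', text):
            yield match.group(1)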
Example #7
    def iter_links_meta_element(cls, element):
        '''Iterate the ``meta`` element for links.

        This function handles refresh URLs.
        '''
        if element.get('http-equiv', '').lower() == 'refresh':
            content_value = element.get('content')
            link = parse_refresh(content_value)
            if link:
                yield LinkInfo(element, element.tag, 'http-equiv',
                               to_str(link), False, True, None, 'refresh')
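
parse_refresh extracts the URL from a value such as ``5; url=/next.html``. A simplified stand-in consistent with how it is called here (the real function lives elsewhere in the project):

    import re

    def parse_refresh(value):
        '''Return the URL from "5; url=/next.html"-style values, or None.'''
        match = re.search(r'url\s*=\s*(["\']?)(.+?)\1\s*$', value or '',
                          re.IGNORECASE)
        if match:
            return match.group(2)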
Example #8
    def iter_links_param_element(cls, element):
        '''Iterate a ``param`` element.'''
        valuetype = element.get('valuetype', '')

        if valuetype.lower() == 'ref' and 'value' in element.attrib:
            yield LinkInfo(
                element, element.tag, 'value',
                to_str(element.get('value')),
                True, False,
                None,
                'plain'
            )
Example #9
    def iter_links_plain_element(cls, element):
        '''Iterate any element for links using generic rules.'''
        for attrib_name, link in cls.iter_links_by_attrib(element):
            inline = cls.is_link_inline(element.tag, attrib_name)
            linked = cls.is_html_link(element.tag, attrib_name)
            yield LinkInfo(
                element, element.tag, attrib_name,
                to_str(link),
                inline, linked,
                None,
                'plain'
            )
Example #10
    def iter_links_link_element(cls, element):
        '''Iterate a ``link`` element for URLs.

        This function handles stylesheets and icons in addition to
        standard scraping rules.
        '''
        rel = element.get('rel', '')
        inline = 'stylesheet' in rel or 'icon' in rel

        for attrib_name, link in cls.iter_links_by_attrib(element):
            yield LinkInfo(element, element.tag, attrib_name, to_str(link),
                           inline, not inline, None, 'plain')
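
So ``<link rel="stylesheet" href="main.css">`` is reported as inline (a resource needed to render the page), while other rel values are reported as linked. A usage sketch, assuming lxml.html parses the bare fragment:

    import lxml.html

    element = lxml.html.fragment_fromstring(
        '<link rel="stylesheet" href="main.css">')
    # 'stylesheet' in rel, so the href yields inline=True, linked=False;
    # for rel="alternate" it would yield inline=False, linked=True.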
Example #11
    def iter_links_object_element(cls, element):
        '''Iterate ``object`` and ``embed`` elements.

        This function also looks at ``codebase`` and ``archive`` attributes.
        '''
        base_link = to_str(element.get('codebase', None))

        if base_link:
            # lxml returns codebase as inline
            yield LinkInfo(element, element.tag, 'codebase', base_link, True,
                           False, None, 'plain')

        for attribute in ('code', 'src', 'classid', 'data'):
            if attribute in element.attrib:
                yield LinkInfo(element, element.tag, attribute,
                               to_str(element.get(attribute)), True, False,
                               base_link, 'plain')

        if 'archive' in element.attrib:
            for match in re.finditer(r'[^ ]+', element.get('archive')):
                value = match.group(0)
                yield LinkInfo(element, element.tag, 'archive', to_str(value),
                               True, False, base_link, 'list')
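
The ``archive`` attribute holds a space-separated list; the re.finditer(r'[^ ]+', ...) call splits it while skipping runs of spaces, and each entry carries the codebase as its base_link. A quick illustration (the file names are made up):

    import re

    archive = 'a.jar  b.jar c.jar'
    values = [match.group(0) for match in re.finditer(r'[^ ]+', archive)]
    # values == ['a.jar', 'b.jar', 'c.jar']; each one becomes its own
    # LinkInfo with value_type='list' and base_link set to the codebase.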
Example #12
    def iter_links_object_element(cls, element):
        '''Iterate ``object`` and ``embed`` elements.

        This function also looks at ``codebase`` and ``archive`` attributes.
        '''
        base_link = to_str(element.get('codebase', None))

        if base_link:
            # lxml returns codebase as inline
            yield LinkInfo(
                element, element.tag, 'codebase',
                base_link,
                True, False,
                None,
                'plain'
            )

        for attribute in ('code', 'src', 'classid', 'data'):
            if attribute in element.attrib:
                yield LinkInfo(
                    element, element.tag, attribute,
                    to_str(element.get(attribute)),
                    True, False,
                    base_link,
                    'plain'
                )

        if 'archive' in element.attrib:
            for match in re.finditer(r'[^ ]+', element.get('archive')):
                value = match.group(0)
                yield LinkInfo(
                    element, element.tag, 'archive',
                    to_str(value),
                    True, False,
                    base_link,
                    'list'
                )
Example #13
    def iter_links_style_element(cls, element):
        '''Iterate a ``style`` element.'''
        if element.text:
            link_iter = itertools.chain(
                CSSScraper.scrape_imports(element.text),
                CSSScraper.scrape_urls(element.text)
            )
            for link in link_iter:
                yield LinkInfo(
                    element, element.tag, None,
                    to_str(link),
                    True, False,
                    None,
                    'css'
                )
Example #14
    def iter_links_meta_element(cls, element):
        '''Iterate the ``meta`` element for links.

        This function handles refresh URLs.
        '''
        if element.get('http-equiv', '').lower() == 'refresh':
            content_value = element.get('content')
            link = parse_refresh(content_value)
            if link:
                yield LinkInfo(
                    element, element.tag, 'http-equiv',
                    to_str(link),
                    False, True,
                    None,
                    'refresh'
                )
Example #15
    def iter_links_link_element(cls, element):
        '''Iterate a ``link`` element for URLs.

        This function handles stylesheets and icons in addition to
        standard scraping rules.
        '''
        rel = element.get('rel', '')
        inline = 'stylesheet' in rel or 'icon' in rel

        for attrib_name, link in cls.iter_links_by_attrib(element):
            yield LinkInfo(
                element, element.tag, attrib_name,
                to_str(link),
                inline, not inline,
                None,
                'plain'
            )
Example #16
    def test_to_str(self):
        self.assertEqual('hi', to_str(b'hi'))
        self.assertEqual(['hi'], to_str([b'hi']))
        self.assertEqual({'hi': 'hello'}, to_str({b'hi': b'hello'}))
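
A to_str consistent with these assertions would decode byte strings recursively and pass everything else through (a sketch assuming UTF-8; it must also leave None unchanged, since Example #11 calls to_str(element.get('codebase', None))):

    def to_str(instance, encoding='utf-8'):
        '''Recursively decode bytes to text (sketch).'''
        if isinstance(instance, bytes):
            return instance.decode(encoding)
        if isinstance(instance, list):
            return [to_str(item, encoding) for item in instance]
        if isinstance(instance, dict):
            return dict(
                (to_str(key, encoding), to_str(value, encoding))
                for key, value in instance.items())
        return instance  # str, None, etc. pass through unchanged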
Example #17
    def scrape(self, request, response):
        if not self.is_html(request, response):
            return

        content_file = response.body.content_file
        encoding = get_heading_encoding(response)

        tree = self.parse(content_file, encoding, request.url_info.url)
        root = tree.getroot()

        if root is None:
            return

        linked_urls = set()
        inline_urls = set()

        link_infos = self.iter_links(root)

        if 'Refresh' in response.fields:
            link = parse_refresh(response.fields['Refresh'])

            if link:
                link_info = LinkInfo(
                    None, '_refresh', None,
                    to_str(link),
                    False, True,
                    None, 'refresh'
                )
                link_infos = itertools.chain(link_infos, [link_info])

        for scraped_link in link_infos:
            if self._only_relative:
                if scraped_link.base_link or '://' in scraped_link.link:
                    continue

            if not self._is_accepted(scraped_link.tag):
                continue

            base_url = root.base_url

            if scraped_link.base_link:
                base_url = wpull.url.urljoin(base_url, scraped_link.base_link)

            url = wpull.url.urljoin(base_url, scraped_link.link,
                allow_fragments=False)

            # Browsers seem to tolerate URLs with newlines; strip them out
            url = url.replace('\n', '').replace('\r', '')

            if scraped_link.inline:
                inline_urls.add(url)
            if scraped_link.linked:
                linked_urls.add(url)

        if self._robots and self._robots_cannot_follow(root):
            linked_urls.clear()

        return {
            'inline_urls': inline_urls,
            'linked_urls': linked_urls,
            'base_url': to_str(root.base_url),
            'encoding': to_str(root.getroottree().docinfo.encoding),
        }