Exemplo n.º 1
0
    def iter_links_by_attrib(self, element):
        '''Iterate an element by looking at its attributes for links.

        Each attribute of ``element`` is handled by the first matching rule:

        * names in ``self.LINK_ATTRIBUTES`` are yielded as
          ``(name, value)``, unless the JavaScript scraper is enabled and
          the value starts with ``javascript:``, in which case the
          percent-decoded value is handed to ``iter_links_by_js_attrib``;
        * when the JavaScript scraper is enabled, names whose first five
          characters are in ``self.DYNAMIC_ATTRIBUTES`` are handed to
          ``iter_links_by_js_attrib``;
        * ``data-*`` attributes are yielded only if the value passes
          ``is_likely_link`` and is not rejected by ``is_unlikely_link``;
        * ``srcset`` is expanded by ``iter_links_by_srcset_attrib``.

        Attributes matching none of the rules are ignored. An attribute
        whose value cannot be read (``ValueError``) is treated as having an
        empty value instead of aborting the iteration.
        '''
        for attrib_name in element.attrib.keys():
            try:
                attrib_value = element.attrib.get(attrib_name)
            except ValueError:
                # lxml.etree.__getNsTag can raise ValueError: Empty tag name
                # https://bugs.python.org/issue28236
                attrib_value = ""

            if attrib_name in self.LINK_ATTRIBUTES:
                if self.javascript_scraper and \
                        attrib_value.lstrip().startswith('javascript:'):
                    # "javascript:" pseudo-URLs are scanned as script text
                    # after undoing percent-encoding.
                    for link in self.iter_links_by_js_attrib(
                            attrib_name, percent_decode(attrib_value)):
                        yield link
                else:
                    yield attrib_name, attrib_value

            elif self.javascript_scraper and \
                    attrib_name[:5] in self.DYNAMIC_ATTRIBUTES:
                # Only the first five characters of the name are compared.
                for link in self.iter_links_by_js_attrib(attrib_name,
                                                         attrib_value):
                    yield link

            elif attrib_name.startswith('data-'):
                if is_likely_link(attrib_value) \
                        and not is_unlikely_link(attrib_value):
                    yield attrib_name, attrib_value

            elif attrib_name == 'srcset':
                items = self.iter_links_by_srcset_attrib(
                    attrib_name, attrib_value)

                for item in items:
                    yield item
Exemplo n.º 2
0
    def iter_links_by_attrib(self, element):
        '''Yield link candidates found in the attributes of an element.

        Every attribute is classified by the first rule that matches its
        name: a known link attribute (``self.LINK_ATTRIBUTES``), a dynamic
        attribute (first five characters in ``self.DYNAMIC_ATTRIBUTES``,
        only when the JavaScript scraper is enabled), a ``data-*``
        attribute, or ``srcset``. Anything else yields nothing.
        '''
        for name in element.attrib.keys():
            try:
                value = element.attrib.get(name)
            except ValueError:
                # lxml.etree.__getNsTag can raise "ValueError: Empty tag
                # name" (https://bugs.python.org/issue28236); fall back to
                # an empty value.
                value = ""

            if name in self.LINK_ATTRIBUTES:
                is_js_url = self.javascript_scraper and \
                    value.lstrip().startswith('javascript:')

                if not is_js_url:
                    yield name, value
                    continue

                # A "javascript:" pseudo-URL: scan the percent-decoded text.
                for found in self.iter_links_by_js_attrib(
                        name, percent_decode(value)):
                    yield found
                continue

            if self.javascript_scraper and \
                    name[:5] in self.DYNAMIC_ATTRIBUTES:
                for found in self.iter_links_by_js_attrib(name, value):
                    yield found
                continue

            if name.startswith('data-'):
                # Keep only values that look like links and are not
                # rejected by the "unlikely" check.
                if not is_likely_link(value) or is_unlikely_link(value):
                    continue
                yield name, value
                continue

            if name == 'srcset':
                for found in self.iter_links_by_srcset_attrib(name, value):
                    yield found
Exemplo n.º 3
0
    def iter_processed_text(self, file, encoding=None, base_url=None):
        '''Yield ``(text, link_info)`` pairs derived from ``self.iter_text``.

        Fragments that ``iter_text`` does not flag as links are passed
        through as ``(text, False)``. A flagged fragment is decoded as the
        body of a JSON string literal, checked with ``is_unlikely_link`` and
        ``is_likely_link`` and, when ``base_url`` is given, joined to it
        with ``urljoin_safe`` (fragments disallowed). If all of that
        produces a non-empty link, ``(link, identify_link_type(link) or
        True)`` is yielded; otherwise the original fragment is yielded as
        ``(text, False)``.
        '''
        for fragment, flagged in self.iter_text(file, encoding):
            link = None

            if flagged:
                try:
                    # Wrap in quotes so JSON string escapes get decoded.
                    decoded = json.loads('"{0}"'.format(fragment))
                except ValueError:
                    # Not a valid string literal body; keep it as text.
                    pass
                else:
                    if not is_unlikely_link(decoded) \
                            and is_likely_link(decoded):
                        if base_url:
                            link = urljoin_safe(
                                base_url, decoded, allow_fragments=False)
                        else:
                            link = decoded

            if link:
                yield (link, identify_link_type(link) or True)
            else:
                yield (fragment, False)
Exemplo n.º 4
0
 def test_is_likely_link(self):
     # Each entry pairs a candidate string with whether is_likely_link is
     # expected to accept it; entries are checked in the order listed.
     expectations = (
         ('image.png', True),
         ('video.mp4', True),
         ('/directory', True),
         ('directory/', True),
         ('/directory/', True),
         ('../directory/', True),
         ('http://example.com/', True),
         ('https://example.com/', True),
         ('ftp://example.com', True),
         ('directory/index.html', True),
         ('directory/another_directory', False),
         ('application/windows.exe', True),
         ('//example.com/admin', True),
         ('12.0', False),
         ('7', False),
         ('horse', False),
         ('', False),
         ('setTimeout(myTimer, 1000)', False),
         ('comment.delete', False),
         ('example.com', False),
         ('example.net', False),
         ('example.org', False),
         ('example.edu', False),
     )

     for candidate, expected in expectations:
         verify = self.assertTrue if expected else self.assertFalse
         verify(is_likely_link(candidate))
Exemplo n.º 5
0
 def test_is_likely_link(self):
     # File names with an extension, slash-delimited paths and absolute
     # URLs are expected to be accepted.
     for accepted in (
             'image.png',
             'video.mp4',
             '/directory',
             'directory/',
             '/directory/',
             '../directory/',
             'http://example.com/',
             'https://example.com/',
             'ftp://example.com',
             'directory/index.html'):
         self.assertTrue(is_likely_link(accepted))

     # Two path segments with no extension and no trailing slash.
     self.assertFalse(is_likely_link('directory/another_directory'))

     for accepted in ('application/windows.exe', '//example.com/admin'):
         self.assertTrue(is_likely_link(accepted))

     # Numbers, bare words, code fragments and bare host names are
     # expected to be rejected.
     for rejected in (
             '12.0',
             '7',
             'horse',
             '',
             'setTimeout(myTimer, 1000)',
             'comment.delete',
             'example.com',
             'example.net',
             'example.org',
             'example.edu'):
         self.assertFalse(is_likely_link(rejected))