Пример #1
0
 def adapt(self, text, htmlpage=None):
     """Turn a link found on ``htmlpage`` into an absolute download URL.

     Without a page the text is returned unchanged; with a page but no
     text the result is ``None``. Otherwise both the link and the page's
     base URL are encoded with the page encoding (``utf-8`` when the page
     has no ``encoding`` attribute), markup-unquoted and stripped before
     being joined and passed through ``safe_download_url``.
     """
     if htmlpage is None:
         return text
     if text is None:
         return None

     page_encoding = getattr(htmlpage, 'encoding', 'utf-8')

     # Link side: encode, unquote, remove disallowed characters, strip.
     raw_link = text.encode(page_encoding)
     link = unquote_markup(raw_link, encoding=page_encoding)
     link = strip_url(disallowed.sub('', link))

     # Base side: same encode/unquote/strip treatment for the page base URL.
     raw_base = get_base_url(htmlpage).encode(page_encoding)
     base_url = strip_url(unquote_markup(raw_base, encoding=page_encoding))

     return safe_download_url(urljoin(base_url, link))
Пример #2
0
 def __call__(self, values, loader_context=None):
     """Resolve the processed values against the loader's base URL.

     ``values`` is first run through the parent class's ``__call__``.
     Each resulting string is markup-unquoted, stripped and joined onto
     ``loader_context['baseurl']`` (empty string when absent). Values that
     are already ``dict`` or ``list`` instances are passed through as-is.

     Returns the list of resulting URLs / passed-through containers.
     """
     values = super(Url, self).__call__(values)
     # ``loader_context`` defaults to None; treat that as "no base URL"
     # instead of failing on ``None.get``.
     if loader_context is None:
         base = ''
     else:
         base = loader_context.get('baseurl', '')
     urls = []
     for value in values:
         if isinstance(value, (dict, list)):
             # Containers are kept untouched; skip the string-only URL
             # handling below, which would otherwise receive the container.
             urls.append(value)
             continue
         value = _strip_url(unquote_markup(value))
         urls.append(urljoin(base, value))
     return urls
Пример #3
0
 def __call__(self, values, loader_context=None):
     """Resolve the processed values against the loader's base URL.

     ``values`` is first run through the parent class's ``__call__``.
     Each resulting string is markup-unquoted, stripped and joined onto
     ``loader_context['baseurl']`` (empty string when absent). Values that
     are already ``dict`` or ``list`` instances are passed through as-is.

     Returns the list of resulting URLs / passed-through containers.
     """
     values = super(Url, self).__call__(values)
     # ``loader_context`` defaults to None; treat that as "no base URL"
     # instead of failing on ``None.get``.
     if loader_context is None:
         base = ''
     else:
         base = loader_context.get('baseurl', '')
     urls = []
     for value in values:
         if isinstance(value, (dict, list)):
             # Containers are kept untouched; skip the string-only URL
             # handling below, which would otherwise receive the container.
             urls.append(value)
             continue
         value = _strip_url(unquote_markup(value))
         urls.append(urljoin(base, value))
     return urls
Пример #4
0
    def test_unquote_markup(self):
        sample_txt1 = u"""<node1>hi, this is sample text with entities: &amp; &copy;
<![CDATA[although this is inside a cdata! &amp; &quot;]]></node1>"""
        sample_txt2 = u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>'
        sample_txt3 = u'something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'

        # make sure it always return unicode
        assert isinstance(unquote_markup(sample_txt1.encode('latin-1')),
                          unicode)
        assert isinstance(unquote_markup(sample_txt2), unicode)

        self.assertEqual(
            unquote_markup(sample_txt1),
            u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(
            unquote_markup(sample_txt2),
            u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(
            unquote_markup(sample_txt1 + sample_txt2),
            u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>"""
        )

        self.assertEqual(
            unquote_markup(sample_txt3),
            u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4'
        )
Пример #5
0
    def test_unquote_markup(self):
        sample_txt1 = u"""<node1>hi, this is sample text with entities: &amp; &copy;
<![CDATA[although this is inside a cdata! &amp; &quot;]]></node1>"""
        sample_txt2 = u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>'
        sample_txt3 = u'something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'

        # make sure it always return unicode
        assert isinstance(unquote_markup(sample_txt1.encode('latin-1')), unicode)
        assert isinstance(unquote_markup(sample_txt2), unicode)

        self.assertEqual(unquote_markup(sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(unquote_markup(sample_txt2), u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(unquote_markup(sample_txt1 + sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>""")

        self.assertEqual(unquote_markup(sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')
Пример #6
0
 def adapt(self, text, htmlpage):
     """Build a download URL from a link ``text`` found on ``htmlpage``.

     The text is encoded with the page's encoding, joined onto the page's
     base URL with ``urljoin_rfc``, markup-unquoted and finally passed
     through ``safe_download_url``.
     """
     encoded_link = text.encode(htmlpage.encoding)
     page_base = get_base_url(htmlpage)
     absolute = urljoin_rfc(page_base, encoded_link)
     unquoted = unquote_markup(absolute)
     return safe_download_url(unquoted)
Пример #7
0
 def adapt(self, text, htmlpage):
     """Build a download URL from a link ``text`` found on ``htmlpage``.

     Both the link text and the page's base URL are encoded with the
     page's encoding before being joined; the joined URL is then
     markup-unquoted (using the same encoding) and passed through
     ``safe_download_url``.
     """
     page_encoding = htmlpage.encoding
     encoded_link = text.encode(page_encoding)
     encoded_base = get_base_url(htmlpage).encode(page_encoding)
     absolute = urljoin(encoded_base, encoded_link)
     unquoted = unquote_markup(absolute, encoding=page_encoding)
     return safe_download_url(unquoted)
Пример #8
0
 def adapt(self, text, htmlpage=None):
     """Build a download URL from a link ``text`` found on ``htmlpage``.

     When no page is given the text is returned unchanged. Otherwise the
     link text and the page's base URL are encoded with the page's
     encoding, joined, markup-unquoted (using the same encoding) and
     passed through ``safe_download_url``.
     """
     if htmlpage is not None:
         page_encoding = htmlpage.encoding
         encoded_link = text.encode(page_encoding)
         encoded_base = get_base_url(htmlpage).encode(page_encoding)
         absolute = urljoin(encoded_base, encoded_link)
         unquoted = unquote_markup(absolute, encoding=page_encoding)
         return safe_download_url(unquoted)
     return text