def adapt(self, text, htmlpage=None):
    """Turn an extracted link into an absolute, download-safe URL.

    text: the extracted link text, or None.
    htmlpage: the page the link came from.  Its ``encoding`` attribute is
        used when present, otherwise 'utf-8'.

    Returns ``text`` untouched when no page is supplied, None when
    ``text`` is None, and otherwise the result of ``safe_download_url``
    applied to the link joined onto the page's base URL.
    """
    # Both guards hand back `text`: it is the caller's value when there is
    # no page, and it is None when there is nothing to adapt.
    if htmlpage is None or text is None:
        return text

    page_encoding = getattr(htmlpage, 'encoding', 'utf-8')

    # Link side: encode, unquote the markup, drop everything matched by
    # the `disallowed` pattern, then strip.
    link = unquote_markup(text.encode(page_encoding), encoding=page_encoding)
    link = strip_url(disallowed.sub('', link))

    # Base side: same unquote + strip treatment, without the filter.
    raw_base = get_base_url(htmlpage).encode(page_encoding)
    base_link = strip_url(unquote_markup(raw_base, encoding=page_encoding))

    return safe_download_url(urljoin(base_link, link))
def __call__(self, values, loader_context=None):
    """Resolve each processed value against the loader context's base URL.

    values: input values; they are first passed through the parent
        class's ``__call__``.
    loader_context: optional object with a ``get`` method.  Its
        ``'baseurl'`` entry (default ``''``) is the base every string
        value is joined onto.  When omitted, an empty base is used.

    Returns a list with one entry per processed value: dict and list
    values are passed through unchanged, every other value is unquoted,
    stripped and joined onto the base URL.
    """
    values = super(Url, self).__call__(values)

    # The default loader_context is None, so guard the lookup instead of
    # raising AttributeError; the base does not change between values, so
    # it is looked up once.
    base = ''
    if loader_context is not None:
        base = loader_context.get('baseurl', '')

    urls = []
    for value in values:
        if isinstance(value, (dict, list)):
            # Structured values are kept as they are.  Without the
            # `continue` they would also fall through to the string
            # handling below.
            urls.append(value)
            continue
        cleaned = _strip_url(unquote_markup(value))
        urls.append(urljoin(base, cleaned))
    return urls
def test_unquote_markup(self):
    """Check unquote_markup on three markup samples containing CDATA.

    Verifies the return type for byte-string and unicode input, then
    compares the output for each sample (and for the first two samples
    concatenated) with a fixed expected string.
    """
    # Inputs.
    markup_a = u"""<node1>hi, this is sample text with entities: & © <![CDATA[although this is inside a cdata! & "]]></node1>"""
    markup_b = u'<node2>blah&blah<![CDATA[blahblahblah!£]]>moreblah<></node2>'
    markup_c = u'something£&more<node3><![CDATA[things, stuff, and such]]>what"ever</node3><node4'

    # Expected outputs.
    expected_a = u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1>"""
    expected_b = u'<node2>blah&blahblahblahblah!£moreblah<></node2>'
    expected_a_then_b = u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>"""
    expected_c = u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4'

    # The result must be unicode whether the input is bytes or unicode.
    from_bytes = unquote_markup(markup_a.encode('latin-1'))
    assert isinstance(from_bytes, unicode)
    from_unicode = unquote_markup(markup_b)
    assert isinstance(from_unicode, unicode)

    self.assertEqual(unquote_markup(markup_a), expected_a)
    self.assertEqual(unquote_markup(markup_b), expected_b)
    self.assertEqual(unquote_markup(markup_a + markup_b), expected_a_then_b)
    self.assertEqual(unquote_markup(markup_c), expected_c)
def test_unquote_markup(self):
    """Exercise unquote_markup with fixed inputs and expected outputs.

    First asserts that the function returns unicode for both a latin-1
    byte string and a unicode string, then walks a table of
    (input, expected output) pairs.
    """
    first = u"""<node1>hi, this is sample text with entities: & © <![CDATA[although this is inside a cdata! & "]]></node1>"""
    second = u'<node2>blah&blah<![CDATA[blahblahblah!£]]>moreblah<></node2>'
    third = u'something£&more<node3><![CDATA[things, stuff, and such]]>what"ever</node3><node4'

    # Return type check: unicode out, regardless of the input type.
    for candidate in (first.encode('latin-1'), second):
        assert isinstance(unquote_markup(candidate), unicode)

    # (input markup, expected result), checked in this order.
    cases = (
        (
            first,
            u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1>""",
        ),
        (
            second,
            u'<node2>blah&blahblahblahblah!£moreblah<></node2>',
        ),
        (
            first + second,
            u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>""",
        ),
        (
            third,
            u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4',
        ),
    )
    for markup, expected in cases:
        self.assertEqual(unquote_markup(markup), expected)
def adapt(self, text, htmlpage):
    """Join an extracted link onto its page's base URL.

    text: the extracted link text; it is encoded with the page encoding.
    htmlpage: the page the link came from; must expose ``encoding``.

    Returns the result of ``safe_download_url`` applied to the joined URL
    after it has been passed through ``unquote_markup``.
    """
    encoded_link = text.encode(htmlpage.encoding)
    page_base = get_base_url(htmlpage)
    absolute = urljoin_rfc(page_base, encoded_link)
    unquoted = unquote_markup(absolute)
    return safe_download_url(unquoted)
def adapt(self, text, htmlpage):
    """Join an extracted link onto its page's base URL.

    text: the extracted link text.
    htmlpage: the page the link came from; must expose ``encoding``.

    Both the link and the page's base URL are encoded with the page
    encoding before being joined.  The joined URL is unquoted with that
    same encoding and returned through ``safe_download_url``.
    """
    encoded_link = text.encode(htmlpage.encoding)
    encoded_base = get_base_url(htmlpage).encode(htmlpage.encoding)
    absolute = urljoin(encoded_base, encoded_link)
    unquoted = unquote_markup(absolute, encoding=htmlpage.encoding)
    return safe_download_url(unquoted)
def adapt(self, text, htmlpage=None):
    """Join an extracted link onto its page's base URL, if a page is given.

    text: the extracted link text.
    htmlpage: the page the link came from, exposing ``encoding``; may be
        omitted.

    Returns ``text`` unchanged when ``htmlpage`` is None.  Otherwise the
    link and the page's base URL are encoded with the page encoding,
    joined, unquoted with that encoding and returned through
    ``safe_download_url``.
    """
    if htmlpage is not None:
        encoded_link = text.encode(htmlpage.encoding)
        encoded_base = get_base_url(htmlpage).encode(htmlpage.encoding)
        absolute = urljoin(encoded_base, encoded_link)
        unquoted = unquote_markup(absolute, encoding=htmlpage.encoding)
        return safe_download_url(unquoted)

    # Without a page there is no base URL to resolve against.
    return text