def test_returns_unicode(self): # make sure it always return uncode assert isinstance(remove_entities(b'no entities'), six.text_type) assert isinstance(remove_entities(b'Price: £100!'), six.text_type) assert isinstance(remove_entities(u'no entities'), six.text_type) assert isinstance(remove_entities(u'Price: £100!'), six.text_type)
def test_regular(self): # regular conversions self.assertEqual(remove_entities(u'As low as £100!'), u'As low as \xa3100!') self.assertEqual(remove_entities(b'As low as £100!'), u'As low as \xa3100!') self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant'), u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
def test_illegal_entities(self):
    """Unknown entity references are kept or dropped per ``remove_illegal``."""
    markup = 'a < b &illegal; c � six'
    self.assertEqual(remove_entities(markup, remove_illegal=False),
                     u'a < b &illegal; c � six')
    self.assertEqual(remove_entities(markup, remove_illegal=True),
                     u'a < b c six')
    self.assertEqual(remove_entities('x≤y'), u'x\u2264y')
def test_keep_entities(self): # keep some entities self.assertEqual( remove_entities(b'<b>Low < High & Medium £ six</b>', keep=['lt', 'amp']), u'<b>Low < High & Medium \xa3 six</b>') self.assertEqual( remove_entities(u'<b>Low < High & Medium £ six</b>', keep=[u'lt', u'amp']), u'<b>Low < High & Medium \xa3 six</b>')
def test_regular(self): # regular conversions self.assertEqual(remove_entities(u'As low as £100!'), u'As low as \xa3100!') self.assertEqual(remove_entities(b'As low as £100!'), u'As low as \xa3100!') self.assertEqual( remove_entities( 'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant' ), u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant' )
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only £42")
    u'only \\xa342'
    """
    # text chunks get entity-decoded in the page's encoding; tags collapse
    # to a single space so adjacent words don't fuse together
    decode_chunk = lambda chunk: remove_entities(
        chunk, encoding=region.htmlpage.encoding)
    tag_to_space = lambda tag: u' '
    joined = u''.join(_process_markup(region, decode_chunk, tag_to_space))
    return _WS.sub(u' ', joined).strip()
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only £42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    # the region already exposes its tag-free text; decode entities in the
    # page's declared encoding, then normalise runs of whitespace
    decoded = remove_entities(region.text_content,
                              encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', decoded).strip()
def join_url(base_url, url, encoding):
    """Decode *url* with *encoding*, strip HTML entities, and resolve it
    against *base_url* with urljoin.

    NOTE(review): an earlier comment claimed leading/trailing whitespace and
    punctuation are removed here, but no stripping happens in this body —
    presumably callers run a clean_link()-style strip first; confirm.
    """
    url = url.decode(encoding)
    return urljoin(base_url, remove_entities(url))
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only £42")
    u'only \\xa342'
    """
    def handle_text(chunk):
        # decode entity references using the page's own encoding
        return remove_entities(chunk, encoding=region.htmlpage.encoding)

    def handle_tag(tag):
        # every tag becomes a single separating space
        return u' '

    joined = u''.join(_process_markup(region, handle_text, handle_tag))
    return _WS.sub(u' ', joined).strip()
def _extract_links(self, response_text, response_url, response_encoding):
    """Regex-scan raw HTML for anchors and return deduplicated Link objects."""
    if self.base_url:
        base_url = urljoin_rfc(response_url, self.base_url)
    else:
        base_url = response_url

    def clean_url(u):
        # decode, de-entity and trim the href, then resolve against base_url
        return urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

    def clean_text(t):
        # strip markup and escape chars from the anchor's inner text
        return replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

    # set() deduplicates (url, text) pairs; ordering is not preserved
    urlstext = set((clean_url(url), clean_text(text))
                   for url, _, text in linkre.findall(response_text))
    return [Link(url, text) for url, text in urlstext]
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """Regex-scan raw HTML for anchors; return Links with re-encoded URLs."""
    if base_url is None:
        # fall back to the response URL unless an explicit base was configured
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

    def clean_url(u):
        return urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

    def clean_text(t):
        return replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

    links = []
    for url, _, text in linkre.findall(response_text):
        # URLs go back to bytes in the response encoding; order is preserved
        links.append(Link(clean_url(url).encode(response_encoding), clean_text(text)))
    return links
def image_url(txt):
    """Convert text to a single-element list holding a safe image URL, or
    None when no URL can be extracted. Relative URLs are supported, so the
    extraction is deliberately conservative.

    Plain URLs and paths pass through:
    >>> image_url('')
    >>> image_url(' ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']

    CSS background declarations are unwrapped:
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    Query strings survive, and unsafe characters are percent-escaped:
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    """
    candidate = extract_image_url(txt)
    if not candidate:
        return None
    # normalise the candidate, decode entities, then escape unsafe characters
    return [safe_url_string(remove_entities(url(candidate)))]
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding.

    Policies:
    * if the regex contains a named group called "extract", that group is
      returned
    * if the regex contains multiple numbered groups, all of them are
      returned (flattened)
    * if the regex doesn't contain any group, the entire match is returned

    :param regex: compiled pattern or pattern string (compiled with re.UNICODE)
    :param text: str or unicode text to search
    :param encoding: used to decode byte strings before entity removal
    :returns: list of unicode strings with entities removed (lt/amp kept)
    """
    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)
    try:
        # Prefer the named "extract" group when the pattern defines it.
        strings = [regex.search(text).group('extract')]
    except (AttributeError, IndexError):
        # AttributeError: search() found no match (returned None).
        # IndexError: the pattern has no group named "extract".
        # The previous bare ``except:`` also swallowed unrelated errors
        # (including KeyboardInterrupt); narrowed to the two expected cases.
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    return [remove_entities(unicode(s, encoding), keep=['lt', 'amp'])
            for s in strings]
def _extract_links(self, response_text, response_url, response_encoding):
    """Find anchors in raw HTML via regex; return deduplicated Links."""
    base_url = response_url
    if self.base_url:
        base_url = urljoin_rfc(response_url, self.base_url)

    decode = lambda s: s.decode(response_encoding)
    # href: decode -> trim -> de-entity -> resolve against the base
    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(decode(u))))
    # anchor text: decode -> strip tags -> unescape -> trim
    clean_text = lambda t: replace_escape_chars(remove_tags(decode(t))).strip()

    pairs = set()
    for url, _, text in linkre.findall(response_text):
        pairs.add((clean_url(url), clean_text(text)))
    return [Link(url, text) for url, text in pairs]
def clean_content(self, text):
    """Return *text* cleaned of tags, entities, escape chars, markup quoting
    and redundant whitespace."""
    # drop style/script/figcaption elements together with their contents,
    # then run the remaining markup through each cleanup pass in order
    cleaned = remove_tags_with_content(
        text, which_ones=('style', 'script', 'figcaption'))
    for cleanup in (remove_tags, remove_entities, replace_escape_chars,
                    unquote_markup):
        cleaned = cleanup(cleaned)
    # collapse all whitespace runs to single spaces and trim the ends
    return " ".join(cleaned.split())
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """Find hrefs in raw HTML via regex; return deduplicated, textless Links."""
    if base_url is None:
        if self.base_url:
            base_url = urljoin(response_url, self.base_url)
        else:
            base_url = response_url

    def clean_url(u):
        # decode, trim and de-entity the raw href, then resolve it
        return urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

    # dedupe via set; note the URLs are re-encoded as UTF-8 bytes
    unique_urls = set(clean_url(url).encode('utf-8')
                      for url in linkre.findall(response_text))
    return [Link(url, "") for url in unique_urls]
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """Find hrefs in raw HTML with the ``linkre`` regex and return
    deduplicated Link objects with empty link text.

    :param response_text: raw HTML to scan
    :param response_url: URL the HTML came from (fallback base)
    :param response_encoding: encoding used to decode extracted hrefs
    :param base_url: optional explicit base; derived from self.base_url /
        response_url when None
    """
    if base_url is None:
        base_url = urljoin(
            response_url, self.base_url) if self.base_url else response_url
    # decode, trim and de-entity the raw href, then resolve against base_url
    clean_url = lambda u: urljoin(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    # (a clean_text lambda was previously defined here but never used —
    # dead code removed; the links are returned with empty text anyway)
    links_text = linkre.findall(response_text)
    # set() deduplicates URLs; link ordering is therefore not preserved
    urlstext = set([clean_url(url) for url in links_text])
    return [Link(url, "") for url in urlstext]
def _has_ajaxcrawlable_meta(text):
    """Return True when *text* contains a live AJAX-crawlable meta tag.

    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Fail-fast path: stripping scripts and comments is slow (about 20x
    # slower than a substring test), and most pages contain neither token.
    for token in ('fragment', 'content'):
        if token not in text:
            return False
    # Remove script/noscript bodies, then comments and entities, so a
    # commented-out meta tag cannot produce a false positive.
    for pattern in (_script_re, _noscript_re):
        text = pattern.sub(u'', text)
    stripped = html.remove_comments(html.remove_entities(text))
    return _ajax_crawlable_re.search(stripped) is not None
def test_illegal_entities(self):
    """Unknown entities are either preserved or stripped per remove_illegal."""
    for remove_flag, expected in [
        (False, u'a < b &illegal; c � six'),
        (True, u'a < b c six'),
    ]:
        self.assertEqual(
            remove_entities('a < b &illegal; c � six', remove_illegal=remove_flag),
            expected)
    self.assertEqual(remove_entities('x≤y'), u'x\u2264y')
def test_browser_hack(self):
    """Numeric refs in the 0x80-0x9F range decode via cp1252, like browsers do."""
    converted = remove_entities('x™y', encoding='cp1252')
    self.assertEqual(converted, u'x\u2122y')
def test_encoding(self): self.assertEqual(remove_entities(b'x\x99™™y', encoding='cp1252'), \ u'x\u2122\u2122\u2122y')
def test_remove_entities(self):
    """End-to-end behavioural checks for remove_entities()."""
    check = self.assertEqual

    # it must always return unicode text
    assert isinstance(remove_entities('no entities'), unicode)
    assert isinstance(remove_entities('Price: £100!'), unicode)

    # regular conversions
    check(remove_entities(u'As low as £100!'), u'As low as \xa3100!')
    check(remove_entities('As low as £100!'), u'As low as \xa3100!')
    check(
        remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant'),
        u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')

    # entities listed in ``keep`` survive untouched
    check(remove_entities('<b>Low < High & Medium £ six</b>', keep=['lt', 'amp']),
          u'<b>Low < High & Medium \xa3 six</b>')

    # unknown ("illegal") entities: kept or dropped per remove_illegal
    check(remove_entities('a < b &illegal; c � six', remove_illegal=False),
          u'a < b &illegal; c � six')
    check(remove_entities('a < b &illegal; c � six', remove_illegal=True),
          u'a < b c six')
    check(remove_entities('x≤y'), u'x\u2264y')

    # browser hack: numeric refs in the 0x80-0x9F range decode as cp1252
    check(remove_entities('x™y', encoding='cp1252'), u'x\u2122y')

    # the encoding argument applies to raw byte input as well
    check(remove_entities('x\x99™™y', encoding='cp1252'),
          u'x\u2122\u2122\u2122y')
from w3lib.html import remove_entities
from urlparse import urljoin


def clean_link(link_text):
    """Strip surrounding whitespace and quote punctuation from a raw link."""
    return link_text.strip("\t\r\n '\"")


def clean_url(base_url, u, response_encoding):
    """Decode *u*, trim it, drop HTML entities, then resolve against *base_url*."""
    return urljoin(base_url,
                   remove_entities(clean_link(u.decode(response_encoding))))
def test_keep_entities(self): # keep some entities self.assertEqual(remove_entities(b'<b>Low < High & Medium £ six</b>', keep=['lt', 'amp']), u'<b>Low < High & Medium \xa3 six</b>') self.assertEqual(remove_entities(u'<b>Low < High & Medium £ six</b>', keep=[u'lt', u'amp']), u'<b>Low < High & Medium \xa3 six</b>')
def deduplication(arg):
    """Remove duplicates from *arg*.

    :param arg: the value to deduplicate
    :returns: a new list when *arg* is a list, a new tuple when it is a
        tuple, otherwise *arg* unchanged. Element order is not preserved
        (the implementation round-trips through a set).
    """
    # isinstance() replaces the old ``type(arg) is types.ListType`` checks:
    # it also accepts list/tuple subclasses and drops the dependency on the
    # Python-2-only ``types`` aliases, without changing behaviour for plain
    # lists and tuples.
    if isinstance(arg, list):
        return list(set(arg))
    if isinstance(arg, tuple):
        return tuple(set(arg))
    return arg


def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation."""
    return link_text.strip("\t\r\n '\"")


# Remove leading/trailing whitespace, punctuation and entities from the
# extracted link text, then join it onto base_url.
clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding))))
#-*-coding:utf-8-*- from urllib import quote from w3lib.html import remove_entities from w3lib.url import _safe_chars from urlparse import urljoin list_first_item = lambda x:x[0] if x else None def clean_link(link_text): return link_text.strip("\t\r\n '\"") clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding)))) def parse_query_string(query): params = query.split("&") keyvals = [] for param in params: kv = param.split("=") + [None] keyvals.append((kv[0], kv[1])) return keyvals def filter_query(query, remove_re=None, keep_re=None): keyvals = parse_query_string(query) qargs = [] for k, v in keyvals: if remove_re is not None and remove_re.search(k): continue if keep_re is None or keep_re.search(k): qarg = quote(k, _safe_chars)
import re from urlparse import urljoin from w3lib.html import remove_tags, remove_entities, replace_escape_chars from w3lib.url import safe_url_string linkre = re.compile( "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>", re.DOTALL | re.IGNORECASE) def clean_link(link_text): """Remove leading and trailing whitespace and punctuation""" return link_text.strip("\t\r\n '\"") base_url="http://www.singeat.com" clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode('utf-8')))) url = "/shop/34566#index=9#pageid=14" print clean_url(url) print safe_url_string(clean_url(url), 'utf-8')
def test_remove_entities(self):
    """Combined coverage of remove_entities() behaviour."""
    # always returns unicode, regardless of input type
    for raw in ('no entities', 'Price: £100!'):
        assert isinstance(remove_entities(raw), unicode)

    # regular conversions
    conversions = [
        (u'As low as £100!', u'As low as \xa3100!'),
        ('As low as £100!', u'As low as \xa3100!'),
        ('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant',
         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant'),
    ]
    for markup, expected in conversions:
        self.assertEqual(remove_entities(markup), expected)

    # entities named in ``keep`` are preserved
    self.assertEqual(
        remove_entities('<b>Low < High & Medium £ six</b>', keep=['lt', 'amp']),
        u'<b>Low < High & Medium \xa3 six</b>')

    # unknown entities: kept or stripped depending on remove_illegal
    self.assertEqual(
        remove_entities('a < b &illegal; c � six', remove_illegal=False),
        u'a < b &illegal; c � six')
    self.assertEqual(
        remove_entities('a < b &illegal; c � six', remove_illegal=True),
        u'a < b c six')
    self.assertEqual(remove_entities('x≤y'), u'x\u2264y')

    # browser hack for numeric character refs in the 0x80-0x9F range
    self.assertEqual(remove_entities('x™y', encoding='cp1252'), u'x\u2122y')

    # explicit encoding for byte input
    self.assertEqual(remove_entities('x\x99™™y', encoding='cp1252'),
                     u'x\u2122\u2122\u2122y')
def cleanHtml(html):
    """Strip entities, tags and comments from *html*, then force to ASCII
    bytes (non-ASCII characters are dropped)."""
    stripped = remove_comments(remove_tags(remove_entities(html)))
    return stripped.encode('ascii', 'ignore')
def good_to_int(arg):
    """Drop the leading '\\xa0' marker from each item and parse it as int."""
    marker_len = len('\xa0')
    return [int(item[marker_len:]) for item in arg]


def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation."""
    return link_text.strip("\t\r\n '\"")


# Remove whitespace, punctuation and entities from the extracted link text,
# then join it onto base_url.
clean_url = lambda base_url, u: urljoin(base_url,
                                        remove_entities(clean_link(u)))

prefix = 'http://so.gushiwen.org/authors/authorsw_'
posfix = '.aspx'


def get_author_page(url):
    """Split an author-listing URL into its (author_id, page_number) ints.

    URLs look like ``<prefix><author>A<page><posfix>``.
    """
    middle = url[len(prefix):-len(posfix)]
    a_pos = middle.find('A')
    return int(middle[:a_pos]), int(middle[a_pos + 1:])
# NOTE(review): this chunk is truncated — the ``def deduplication(arg):``
# header and the opening ``"""`` of its docstring lie before the visible
# source; the fragment below is the tail of that function's docstring and
# body, reproduced unchanged.
    deduplication the arg.
    @param: arg:the variable to deduplication
        if arg is list,then deduplication it and then the new list.
        if arg is tuple,then deduplication it and then the new tuple.
    """
    # Round-trips through set(), so element order is not preserved.
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))
    return arg


def clean_link(link_text):
    """
    Remove leading and trailing whitespace and punctuation
    """
    return link_text.strip("\t\r\n '\"")


# Decode the raw link with the response encoding, trim and de-entity it,
# then resolve it against base_url.
clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
remove leading and trailing whitespace and punctuation and entities from
the given text. then join the base_url and the link that extract
"""
def test_remove_entities(self):
    """Full behavioural sweep of remove_entities()."""
    eq = self.assertEqual

    # the return type is always unicode
    assert isinstance(remove_entities("no entities"), unicode)
    assert isinstance(remove_entities("Price: £100!"), unicode)

    # regular conversions
    eq(remove_entities(u"As low as £100!"), u"As low as \xa3100!")
    eq(remove_entities("As low as £100!"), u"As low as \xa3100!")
    eq(
        remove_entities("redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant"),
        u"redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant",
    )

    # entities listed in ``keep`` are preserved verbatim
    eq(
        remove_entities("<b>Low < High & Medium £ six</b>", keep=["lt", "amp"]),
        u"<b>Low < High & Medium \xa3 six</b>",
    )

    # unknown entities: kept or stripped depending on remove_illegal
    eq(
        remove_entities("a < b &illegal; c � six", remove_illegal=False),
        u"a < b &illegal; c � six",
    )
    eq(remove_entities("a < b &illegal; c � six", remove_illegal=True), u"a < b c six")
    eq(remove_entities("x≤y"), u"x\u2264y")

    # browser hack: 0x80-0x9F numeric refs decode via cp1252
    eq(remove_entities("x™y", encoding="cp1252"), u"x\u2122y")

    # byte input honours the encoding argument
    eq(remove_entities("x\x99™™y", encoding="cp1252"), u"x\u2122\u2122\u2122y")