def test_remove_comments(self):
    """Check that remove_comments() strips HTML comments and is unicode-safe."""
    # The helper must return unicode no matter what kind of string comes in.
    assert isinstance(remove_comments('without comments'), unicode)
    assert isinstance(remove_comments('<!-- with comments -->'), unicode)
    # Comment-free text passes through unchanged.
    self.assertEqual(remove_comments(u'text without comments'),
                     u'text without comments')
    # Comments disappear entirely, whether standalone or embedded.
    self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
    self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
def test_remove_comments(self):
    """remove_comments() always yields unicode and drops <!-- --> blocks."""
    # Return type is unicode for both commented and comment-free input.
    for sample in ('without comments', '<!-- with comments -->'):
        assert isinstance(remove_comments(sample), unicode)
    # Text with no comments is returned verbatim.
    self.assertEqual(remove_comments(u'text without comments'),
                     u'text without comments')
    # A pure comment becomes the empty string; an embedded one is excised.
    self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
    self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
def _process_markup(region, textf, tagf):
    """Walk *region*'s parsed fragments and yield processed output.

    Text fragments are cleaned of HTML comments and passed through
    *textf*; tag fragments are passed through *tagf*.  Tags listed in
    ``_TAGS_TO_PURGE`` (presumably script/style and the like — confirm
    against the module constant) are dropped together with everything up
    to their matching close tag.  Falsy results from either callback are
    suppressed.  If the region has no ``parsed_fragments`` attribute the
    whole region object is handed to *textf* as a single item.
    """
    fragments = getattr(region, 'parsed_fragments', None)
    if fragments is None:
        # No parse available: treat the region itself as one text unit.
        yield textf(region)
        return
    # A single shared iterator lets the inner loop below consume (skip)
    # fragments so the outer loop never sees the purged tag's contents.
    fiter = iter(fragments)
    for fragment in fiter:
        if isinstance(fragment, HtmlTag):
            # skip forward to closing script tags
            tag = fragment.tag
            if tag in _TAGS_TO_PURGE:
                # if opening, keep going until closed
                if fragment.tag_type == HtmlTagType.OPEN_TAG:
                    # Advance the shared iterator past everything inside
                    # the purged element, up to and including its close tag.
                    for probe in fiter:
                        if isinstance(probe, HtmlTag) and \
                           probe.tag == tag and \
                           probe.tag_type == HtmlTagType.CLOSE_TAG:
                            break
            else:
                # Regular tag: emit whatever the tag callback produces.
                output = tagf(fragment)
                if output:
                    yield output
        else:
            # Text fragment: slice its raw data from the page, strip
            # comments, then apply the text callback.
            text = region.htmlpage.fragment_data(fragment)
            text = remove_comments(text)
            text = textf(text)
            if text:
                yield text
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    # Results are memoized per response in the module-level cache.
    if response not in _metaref_cache:
        # Only the first 4 KB of the body is inspected; entities and HTML
        # comments are stripped before matching.
        chunk = remove_comments(
            remove_entities(response.body_as_unicode()[0:4096]))
        found = META_REFRESH_RE.search(chunk)
        if found is None:
            _metaref_cache[response] = (None, None)
        else:
            delay = float(found.group('int'))
            target = safe_url_string(found.group('url').strip(' "\''))
            target = urljoin_rfc(response.url, target)
            _metaref_cache[response] = (delay, target)
    return _metaref_cache[response]
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    # Serve a previously computed answer straight from the cache.
    if response in _metaref_cache:
        return _metaref_cache[response]
    # Search only the leading 4096 characters, with entities resolved and
    # HTML comments removed first.
    head = response.body_as_unicode()[0:4096]
    body_chunk = remove_comments(remove_entities(head))
    result = (None, None)
    match = META_REFRESH_RE.search(body_chunk)
    if match:
        interval = float(match.group('int'))
        url = safe_url_string(match.group('url').strip(' "\''))
        result = (interval, urljoin_rfc(response.url, url))
    _metaref_cache[response] = result
    return result
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    # Results are memoized per response in the module-level cache.
    if response not in _metaref_cache:
        # Inspect only the first 4 KB, with entities resolved and HTML
        # comments stripped, before running the regex scans.
        body_chunk = remove_comments(remove_entities(response.body_as_unicode()[0:4096]))
        # Scan each <meta ...> tag and collect its attributes into a dict.
        for match1 in META_TAG_RE.finditer(body_chunk):
            params = {}
            for match2 in META_TAG_ATTRS_RE.finditer(match1.group(1)):
                params[match2.group("key")] = match2.group("value")
            if params.get("http-equiv") == "refresh":
                # content is expected as "<seconds>; url=<target>" — the
                # url part may be absent, hence the `or ""` fallback.
                match = META_CONTENT_RE.search(params.get("content", ""))
                if match:
                    interval = float(match.group("int"))
                    url = urljoin_rfc(response.url, safe_url_string((match.group("url") or "").strip(' "\'')))
                    # Cache and return on the first matching refresh tag.
                    _metaref_cache[response] = (interval, url)
                    return (interval, url)
        # No refresh directive found anywhere in the inspected chunk.
        _metaref_cache[response] = (None, None)
    return _metaref_cache[response]