def strip_html(value):
    '''Return ``value`` with every ``<...>`` tag removed.

    The input is first passed through ``util.compact_html``; each
    non-greedy ``<...>`` match in the result is then deleted.

    >>> strip_html('a<b>b</b>c')
    'abc'
    '''
    compacted = util.compact_html(value)
    tag_pattern = re.compile('<.+?>')
    return tag_pattern.sub('', compacted)
def filter(self, value):
    '''Strip HTML tags from ``value``; ``None`` is passed through as-is.

    A non-``None`` value is run through ``util.compact_html`` and every
    non-greedy ``<...>`` match is then removed from the result.

    >>> f = StripHtmlFilter()
    >>> f.filter(None) is None
    True
    >>> f.filter('a<b>b</b>c')
    'abc'
    '''
    if value is not None:
        compacted = util.compact_html(value)
        return re.sub('<.+?>', '', compacted)
    return None
def extract_from_url(url, rule, charset=None, fmt='yaml'):
    '''Fetch ``url`` and apply ``rule`` to the retrieved page.

    The page is obtained with ``util.curl`` (``charset`` is forwarded to
    it), passed through ``util.compact_html``, and handed to ``extract``
    together with ``rule`` and ``fmt``. Returns whatever ``extract``
    returns.
    '''
    page = util.curl(url, charset=charset)
    return extract(util.compact_html(page), rule, fmt=fmt)
def extract_from_url(url, rule, fmt='yaml', timeout=30):
    '''Download ``url`` with ``requests`` and apply ``rule`` to its HTML.

    The response text is passed through ``util.compact_html`` and then
    handed to ``extract`` together with ``rule`` and ``fmt``.

    Args:
        url: Address of the page to download.
        rule: Extraction rule, forwarded unchanged to ``extract``.
        fmt: Forwarded unchanged to ``extract`` as ``fmt``.
        timeout: Value forwarded to ``requests.get`` as ``timeout``
            (seconds, or a ``(connect, read)`` tuple). Pass ``None`` to
            wait indefinitely, which was the previous behaviour.

    Returns:
        Whatever ``extract`` returns for the downloaded page.

    Raises:
        requests.exceptions.Timeout: If the server does not respond
            within ``timeout``.
    '''
    from textminer import util

    # requests applies no timeout unless one is given, so without this an
    # unresponsive server would block the caller forever.
    response = requests.get(url, timeout=timeout)
    html = util.compact_html(response.text)
    return extract(html, rule, fmt=fmt)