def substitute_entity(match):
    """Rewrite a named entity reference as a numeric character reference.

    ``match`` is a regex match object whose group 1 holds the entity name.
    When that name is a key of ``n2cp`` the result is ``&#<codepoint>;``;
    otherwise the matched text (group 0) is returned unchanged.
    """
    ent = match.group(1)
    # Test membership on the mapping itself: ``n2cp.keys()`` builds a list on
    # Python 2, which turns every lookup into a linear scan.
    if ent in n2cp:
        return "&#%i;" % n2cp[ent]
    return match.group(0)
def replace(match):
    """Return the character for a matched character or entity reference.

    When group 1 of ``match`` is set, group 2 is parsed as a decimal code
    point and converted with ``unichr``.  Otherwise group 3 is looked up in
    ``name2codepoint``.  Unknown names, and numeric references that cannot be
    converted to a character, yield an empty string.
    """
    if match.group(1):
        try:
            return unichr(int(match.group(2)))
        except (ValueError, OverflowError):
            # e.g. "&#99999999;" is not a valid code point.  Drop it the same
            # way an unknown name is dropped instead of aborting the whole
            # substitution.
            return ""
    name = match.group(3)
    # Membership is tested on the dict itself; ``.keys()`` is a list scan on
    # Python 2.
    if name in name2codepoint:
        return unichr(name2codepoint[name])
    return ""
def __init__(self, html_entities=False):
    """Set up the empty element store and compile the entity matcher.

    With ``html_entities`` true the matcher recognises every name in
    ``name2codepoint`` written as ``&name;``; otherwise it matches only the
    bare characters ``&``, ``>`` and ``<``.  The pattern is compiled with
    ``re.IGNORECASE`` and ``re.MULTILINE``.
    """
    self._elements = {}
    if not html_entities:
        alternatives = ['&', '>', '<']
    else:
        alternatives = ['&%s;' % key for key in name2codepoint]
    pattern = '|'.join(alternatives)
    flags = re.IGNORECASE | re.MULTILINE
    self._entities = re.compile(pattern, flags)
def html2unicode(s):
    """Replace named HTML entities in *s* with their Unicode characters.

    Every ``&name;`` whose name is a key of ``name2codepoint`` is replaced,
    except ``&amp;``, ``&gt;`` and ``&lt;``, which are left as they are.
    """
    untouched = ['amp', 'gt', 'lt']
    for name, codepoint in name2codepoint.items():
        if name in untouched:
            continue
        s = s.replace('&%s;' % name, unichr(codepoint))
    return s
def decode_entities(text):
    """Decode the character and entity references found in *text*.

    Each match of ``__regx_reference`` is replaced according to its group 1:
    a key of ``name2codepoint`` becomes that character, a match of
    ``__regx_num16`` is parsed as hexadecimal and a match of
    ``__regx_num10`` as decimal.  A reference that fits none of these is
    removed from the output.  Text between references is copied through
    unchanged and the pieces are joined onto a unicode string.
    """
    # Collect the pieces and join once: repeated ``+=`` on a string may copy
    # the accumulated text on every iteration.
    pieces = []
    pos = 0
    for m in __regx_reference.finditer(text):
        pieces.append(text[pos:m.start()])
        pos = m.end()
        name = m.group(1)
        # Dict membership instead of ``.keys()`` (a list scan on Python 2).
        if name in name2codepoint:
            pieces.append(unichr(name2codepoint[name]))
        elif __regx_num16.match(name):
            # Drop the first character and prefix "0" so int(..., 16) sees a
            # "0x.."-style literal (presumably name looks like "#x41").
            pieces.append(unichr(int(u'0' + name[1:], 16)))
        elif __regx_num10.match(name):
            pieces.append(unichr(int(name[1:])))
    pieces.append(text[pos:])
    return u''.join(pieces)
def decode_entities(text):
    """Decode the character and entity references found in *text*.

    Each match of ``__regx_reference`` is replaced according to its group 1:
    a key of ``name2codepoint`` becomes that character, a match of
    ``__regx_num16`` is parsed as hexadecimal and a match of
    ``__regx_num10`` as decimal.  A reference that fits none of these is
    removed from the output.  Text between references is copied through
    unchanged and the pieces are joined onto a unicode string.
    """
    # Collect the pieces and join once: repeated ``+=`` on a string may copy
    # the accumulated text on every iteration.
    pieces = []
    pos = 0
    for m in __regx_reference.finditer(text):
        pieces.append(text[pos : m.start()])
        pos = m.end()
        name = m.group(1)
        # Dict membership instead of ``.keys()`` (a list scan on Python 2).
        if name in name2codepoint:
            pieces.append(unichr(name2codepoint[name]))
        elif __regx_num16.match(name):
            # Drop the first character and prefix "0" so int(..., 16) sees a
            # "0x.."-style literal (presumably name looks like "#x41").
            pieces.append(unichr(int(u"0" + name[1:], 16)))
        elif __regx_num10.match(name):
            pieces.append(unichr(int(name[1:])))
    pieces.append(text[pos:])
    return u"".join(pieces)
def match_xpath(self, html):
    """Search for the content given by the {image,title,body,iframe}-xpath values.

    The page is prepared by removing everything ``script_re`` matches and by
    rewriting named entity references (``&name;``) as numeric character
    references, then parsed with ``et.fromstring``.  If parsing fails, a
    diagnostic is written to stderr and the method returns without assigning
    any result attribute.

    On success ``self.imgLink``, ``self.itemTitle``, ``self.itemBody`` and
    ``self.iframe`` are set from the ``image-xpath``, ``title-xpath``,
    ``body-xpath`` and ``iframe-xpath`` entries of ``self.attrs``.  Each is
    ``None`` when the entry is not configured or matches nothing.
    """
    # remove scripts; they generally can't be parsed as XML
    html = script_re.sub('', html)

    # Replace entity references by their corresponding character references.
    from htmlentitydefs import name2codepoint
    eref_re = re.compile('&(' + '|'.join(name2codepoint.keys()) + ');')
    html = eref_re.sub(lambda m: '&#%d;' % name2codepoint[m.group(1)], html)

    try:
        root = et.fromstring(html)
    except Exception as e:
        sys.stderr.write((u"Page at %s not parseable as XML: %s\n\n----\n%s----\n" % (
            self.uri, e, html
        )).encode(self.encoding))
        return

    def _find(attrname):
        """Return the first match for the xpath configured under *attrname*."""
        if attrname not in self.attrs:
            return None
        configured = self.attrs[attrname]
        xpath = configured

        # handle text and attribute access which are not done by ElementTree
        accessf = lambda e: et.tostring(e)  # default: element as X(HT)ML
        if xpath.endswith('/text()'):  # text content
            xpath = xpath[:-7]
            accessf = lambda e: e.text
        elif '/@' in xpath:  # attribute value
            # Keep the XML attribute name in its own variable.  Rebinding
            # ``attrname`` here made the "not found" message below report the
            # XML attribute instead of the configuration key.
            xpath, xml_attr = xpath.split('/@', 1)
            accessf = lambda e: e.get(xml_attr)

        for e in root.iterfind('.' + xpath):
            return accessf(e)  # use only the first match

        # Nothing matched: report the configuration key and the expression
        # exactly as configured.
        sys.stderr.write((u"%s '%s' not found at %s:\n\n----\n%s----\n" % (
            attrname, configured, self.uri, html
        )).encode(self.encoding))

    self.imgLink = _find('image-xpath')
    self.itemTitle = _find('title-xpath')
    self.itemBody = _find('body-xpath')
    self.iframe = _find('iframe-xpath')
help='tolerant', default=False) parser.add_argument('-v', action='store_true', dest='verbose', help='print lines and linenr', default=False) args = parser.parse_args(sys.argv[1:]) expressions = [] if args.tolerant: # We are looking for damaged entities, i.e. entities where either the # preceding & or the succeeding ; is missing expressions = [ re.compile("(?:[^a-zA-Z])(&%s|%s;)" % (name, name)) for name in name2codepoint.keys() ] else: expressions = [ re.compile("&%s;" % (name)) for name in name2codepoint.keys() ] entity_counts = defaultdict(int) for linenr, line in enumerate(sys.stdin): line = line.strip() #print line for ex in expressions: match = ex.search(line) if match: entity_counts[match.group()] += 1 if args.verbose:
import re from htmlentitydefs import name2codepoint from twisted.python.text import greedyWrap from twisted.web.error import Error as WebError # From http://wiki.python.org/moin/EscapingHtml _HTMLENT_CODEPOINT_RE = re.compile('&({0}|#\d+);'.format( '|'.join(name2codepoint.keys()))) def recodeText(text): """Parses things like & and ὔ into real characters.""" def _entToUnichr(match): ent = match.group(1) try: if ent.startswith("#"): char = unichr(int(ent[1:])) else: char = unichr(name2codepoint[ent]) except: char = match.group(0) return char return _HTMLENT_CODEPOINT_RE.sub(_entToUnichr, text) _whaleFmt = """\ _{lines}__
from htmlentitydefs import name2codepoint
import sys
import time

# Print one "name: character" line per entry of name2codepoint, emitting the
# text one character at a time with a short pause after each.
listing = u'\n'.join([
    u'%s: %s' % (name, unichr(codepoint))
    for name, codepoint in name2codepoint.items()
])
for c in listing:
    sys.stdout.write(c)
    # Flush after every character; otherwise a buffered stdout holds the
    # characters back and the per-character pause is not visible.
    sys.stdout.flush()
    time.sleep(0.03)
from writer import write_numbers

# Script entry point: count matches of HTML entity patterns in the lines read
# from stdin.
# NOTE(review): the block layout below was reconstructed from a flattened
# source line; confirm the nesting (in particular of the verbose print).
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    # -t: look for damaged entities instead of well-formed ones
    parser.add_argument('-t', action='store_true', dest='tolerant', help='tolerant', default=False)
    # -v: also print the line number and text of each matching line
    parser.add_argument('-v', action='store_true', dest='verbose', help='print lines and linenr', default=False)
    args = parser.parse_args(sys.argv[1:])
    expressions = []
    # One compiled pattern per key of name2codepoint.
    if args.tolerant:
        # We are looking for damaged entities, i.e. entities where either the
        # preceding & or the succeeding ; is missing
        # The pattern requires one non-letter character before "&name" or
        # "name;", so that character is part of the overall match.
        expressions = [re.compile("(?:[^a-zA-Z])(&%s|%s;)" %(name, name)) for name in name2codepoint.keys()]
    else:
        expressions = [re.compile("&%s;" %(name)) for name in name2codepoint.keys()]
    # Maps matched text -> number of lines on which it was found.
    entity_counts = defaultdict(int)
    for linenr, line in enumerate(sys.stdin):
        line = line.strip()
        #print line
        for ex in expressions:
            # search() reports only the first hit, so each pattern counts at
            # most once per line.
            match = ex.search(line)
            if match:
                # Keyed by the whole match (group 0); in tolerant mode that
                # includes the preceding non-letter character.
                entity_counts[match.group()] += 1
                if args.verbose:
                    print linenr, line
    # print entity_counts
def codepoint2name(self):
    """Return a new dict that inverts ``name2codepoint``.

    Keys are code points and values are entity names.  If several names
    share a code point, the one visited last is kept.
    """
    inverted_pairs = (
        (codepoint, name) for name, codepoint in name2codepoint.items()
    )
    return dict(inverted_pairs)
from operator import itemgetter

# Script entry point: count matches of HTML entity patterns in the lines read
# from stdin.
# NOTE(review): the block layout below was reconstructed from a flattened
# source line; confirm the nesting (in particular of the verbose print).
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    # -t: look for damaged entities instead of well-formed ones
    parser.add_argument('-t', action='store_true', dest='tolerant', help='tolerant', default=False)
    # -v: also print the line number and text of each matching line
    parser.add_argument('-v', action='store_true', dest='verbose', help='print lines and linenr', default=False)
    args = parser.parse_args(sys.argv[1:])
    expressions = []
    # One compiled pattern per key of name2codepoint.
    if args.tolerant:
        # We are looking for damaged entities, i.e. entities where either the
        # preceding & or the succeeding ; is missing
        # The pattern requires one non-letter character before "&name" or
        # "name;", so that character is part of the overall match.
        expressions = [re.compile("(?:[^a-zA-Z])(&%s|%s;)" %(name, name)) for name in name2codepoint.keys()]
    else:
        expressions = [re.compile("&%s;" %(name)) for name in name2codepoint.keys()]
    # Maps matched text -> number of lines on which it was found.
    entity_counts = defaultdict(int)
    for linenr, line in enumerate(sys.stdin):
        line = line.strip()
        #print line
        for ex in expressions:
            # search() reports only the first hit, so each pattern counts at
            # most once per line.
            match = ex.search(line)
            if match:
                # Keyed by the whole match (group 0); in tolerant mode that
                # includes the preceding non-letter character.
                entity_counts[match.group()] += 1
                if args.verbose:
                    print linenr, line
    # print entity_counts