示例#1
0
    def substitute_entity(match):
        """Turn a named entity reference into a numeric character reference.

        Group 1 of ``match`` is taken as the entity name.  When that name is a
        key of ``n2cp`` the result is ``&#<value>;`` built from the mapped
        value; otherwise the matched text is returned unchanged.
        """
        ent = match.group(1)
        # Test membership on the mapping itself rather than on ``.keys()``,
        # which materialises a list on every call under Python 2.
        if ent in n2cp:
            return "&#%i;" % n2cp[ent]
        return match.group(0)
def replace(match):
    """Return the character denoted by one matched entity reference.

    The match is expected to carry three groups: when group 1 is non-empty,
    group 2 is parsed as a decimal code point; otherwise group 3 is looked up
    as an entity name in ``name2codepoint``.  A name that is not in the
    mapping yields the empty string, i.e. the reference is removed.
    """
    if match.group(1):
        return unichr(int(match.group(2)))
    name = match.group(3)
    # Membership on the mapping itself: ``.keys()`` would build and scan a
    # list for every match under Python 2.
    if name in name2codepoint:
        return unichr(name2codepoint[name])
    return ""
示例#3
0
 def __init__(self, html_entities=False):
     """Initialise ``_elements`` and compile the entity-matching pattern.

     With ``html_entities`` true the pattern matches ``&name;`` for every
     name in ``name2codepoint``; otherwise it matches only the literal
     characters ``&``, ``>`` and ``<``.  The pattern is compiled with
     ``re.IGNORECASE | re.MULTILINE`` and stored in ``_entities``.
     """
     self._elements = {}
     # Start from the three bare characters and widen only on request.
     entity_names = ['&', '>', '<']
     if html_entities:
         entity_names = ['&%s;' % name for name in name2codepoint.keys()]
     pattern = '|'.join(entity_names)
     self._entities = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
示例#4
0
def html2unicode(s):
    """Replace named HTML entities in ``s`` with their unicode characters.

    Every ``&name;`` whose name is in ``name2codepoint`` is substituted,
    except ``&amp;``, ``&gt;`` and ``&lt;``, which are left as they are.
    Returns the resulting string.
    """
    untouched = ('amp', 'gt', 'lt')
    for name, codepoint in name2codepoint.items():
        if name in untouched:
            continue
        s = s.replace('&%s;' % name, unichr(codepoint))
    return s
示例#5
0
def decode_entities(text):
    """Return ``text`` with entity references replaced by their characters.

    References are located with ``__regx_reference``; group 1 of each match
    is the reference name.  A name found in ``name2codepoint`` is replaced by
    the mapped character.  Otherwise a name accepted by ``__regx_num16`` is
    parsed as hexadecimal and one accepted by ``__regx_num10`` as decimal
    (both after dropping the first character of the name).

    NOTE(review): a reference that matches none of the three cases is removed
    from the output; that is the existing behaviour and is kept as is.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    i = 0
    while 1:
        m = __regx_reference.search(text, i)
        if m is None:
            parts.append(text[i:])
            break
        parts.append(text[i:m.start()])
        i = m.end()
        name = m.group(1)
        # Membership on the mapping itself; ``.keys()`` builds a list per
        # reference under Python 2.
        if name in name2codepoint:
            parts.append(unichr(name2codepoint[name]))
        elif __regx_num16.match(name):
            # "#x1F" -> "0x1F", which int(..., 16) accepts.
            parts.append(unichr(int(u'0' + name[1:], 16)))
        elif __regx_num10.match(name):
            parts.append(unichr(int(name[1:])))
    return u''.join(parts)
示例#6
0
文件: char.py 项目: gotoc/summary
def decode_entities(text):
    """Decode the entity references found in ``text`` and return the result.

    Each match of ``__regx_reference`` contributes its group 1 as the
    reference name.  Names present in ``name2codepoint`` become the mapped
    character; names accepted by ``__regx_num16`` are read as hexadecimal and
    names accepted by ``__regx_num10`` as decimal, in both cases after the
    first character of the name is dropped.

    NOTE(review): a reference that fits none of these cases is dropped from
    the output; this keeps the existing behaviour.
    """
    # Gather fragments in a list and join once at the end rather than
    # repeatedly extending a string.
    pieces = []
    i = 0
    while 1:
        m = __regx_reference.search(text, i)
        if m is None:
            pieces.append(text[i:])
            break
        pieces.append(text[i : m.start()])
        i = m.end()
        name = m.group(1)
        # Look the name up in the mapping directly instead of in the list
        # that ``.keys()`` would build for every reference under Python 2.
        if name in name2codepoint:
            pieces.append(unichr(name2codepoint[name]))
        elif __regx_num16.match(name):
            # Prefixing "0" turns "x1F" into "0x1F" for int(..., 16).
            pieces.append(unichr(int(u"0" + name[1:], 16)))
        elif __regx_num10.match(name):
            pieces.append(unichr(int(name[1:])))
    return u"".join(pieces)
示例#7
0
    def match_xpath(self, html):
        """Search for the content given by the {image,title,body,iframe}-xpath values.

        Scripts are stripped from ``html``, named entity references are turned
        into numeric character references, and the result is parsed with
        ``et.fromstring``.  The first match of each XPath configured in
        ``self.attrs`` is then stored in ``self.imgLink``, ``self.itemTitle``,
        ``self.itemBody`` and ``self.iframe``; a value is ``None`` when its
        XPath is not configured or matches nothing.  If the page cannot be
        parsed, a diagnostic is written to stderr and the method returns
        without setting those attributes.
        """

        # remove scripts; they generally can't be parsed as XML
        html = script_re.sub('', html)

        # Replace entity references by their corresponding character references.
        from htmlentitydefs import name2codepoint
        eref_re = re.compile('&(' + '|'.join(name2codepoint.keys()) + ');')
        html = eref_re.sub(lambda m: '&#%d;' % name2codepoint[m.group(1)], html)

        try:
            root = et.fromstring(html)
        except Exception as e:
            sys.stderr.write((u"Page at %s not parseable as XML: %s\n\n----\n%s----\n" % (
                self.uri, e, html
            )).encode(self.encoding))
            return

        def _find(attrname):
            """Return the first match for the XPath stored under ``attrname``.

            Returns ``None`` when ``attrname`` is not configured.  When the
            XPath matches nothing, a diagnostic is written to stderr and
            ``None`` is returned implicitly.
            """
            if attrname not in self.attrs:
                return None
            xpath = self.attrs[attrname]
            # handle text and attribute access which are not done by ElementTree
            accessf = lambda e: et.tostring(e)  # default: element as X(HT)ML
            if xpath.endswith('/text()'):       # text content
                xpath = xpath[:-7]
                accessf = lambda e: e.text
            elif '/@' in xpath:                 # attribute value
                # Keep the XML attribute under its own name: rebinding
                # ``attrname`` here made the not-found message below report
                # the XML attribute instead of the configuration key.
                xpath, xml_attr = xpath.split('/@', 1)
                accessf = lambda e: e.get(xml_attr)
            for e in root.iterfind('.' + xpath):
                return accessf(e)               # use only the first match
            sys.stderr.write((u"%s '%s' not found at %s:\n\n----\n%s----\n" % (
                attrname, xpath, self.uri, html
            )).encode(self.encoding))

        self.imgLink = _find('image-xpath')
        self.itemTitle = _find('title-xpath')
        self.itemBody = _find('body-xpath')
        self.iframe = _find('iframe-xpath')
示例#8
0
                        help='tolerant',
                        default=False)
    parser.add_argument('-v',
                        action='store_true',
                        dest='verbose',
                        help='print lines and linenr',
                        default=False)
    args = parser.parse_args(sys.argv[1:])

    expressions = []
    if args.tolerant:
        # We are looking for damaged entities, i.e. entities where either the
        # preceding & or the succeeding ; is missing
        expressions = [
            re.compile("(?:[^a-zA-Z])(&%s|%s;)" % (name, name))
            for name in name2codepoint.keys()
        ]
    else:
        expressions = [
            re.compile("&%s;" % (name)) for name in name2codepoint.keys()
        ]

    entity_counts = defaultdict(int)
    for linenr, line in enumerate(sys.stdin):
        line = line.strip()
        #print line
        for ex in expressions:
            match = ex.search(line)
            if match:
                entity_counts[match.group()] += 1
                if args.verbose:
示例#9
0
文件: util.py 项目: olix0r/vtwt
import re
from htmlentitydefs import name2codepoint

from twisted.python.text import greedyWrap
from twisted.web.error import Error as WebError


# From http://wiki.python.org/moin/EscapingHtml

# Matches "&name;" for every name in name2codepoint, or a decimal numeric
# reference "&#<digits>;"; group 1 is the name or "#<digits>".  Written as a
# raw string so that "\d" is a regex escape, not an invalid string escape.
_HTMLENT_CODEPOINT_RE = re.compile(r'&({0}|#\d+);'.format(
        '|'.join(name2codepoint)))

def recodeText(text):
    """Parses things like &amp; and &#8020; into real characters.

    Every reference matched by ``_HTMLENT_CODEPOINT_RE`` in ``text`` is
    replaced by the character it denotes: ``#<digits>`` via its decimal code
    point, anything else via ``name2codepoint``.  A reference that cannot be
    converted is left in the text unchanged.  Returns the rewritten text.
    """
    def _entToUnichr(match):
        ent = match.group(1)
        try:
            if ent.startswith("#"):
                return unichr(int(ent[1:]))
            return unichr(name2codepoint[ent])
        except (KeyError, ValueError, OverflowError):
            # Unknown name, or a code point unichr() rejects: keep the
            # reference as written.  Only conversion errors are caught so
            # that KeyboardInterrupt and SystemExit still propagate.
            return match.group(0)

    return _HTMLENT_CODEPOINT_RE.sub(_entToUnichr, text)


_whaleFmt = """\
   _{lines}__
示例#10
0
文件: n2cp.py 项目: gemdude46/1337
from htmlentitydefs import name2codepoint
import sys, time
# Build one "name: character" line per entity in name2codepoint, then emit
# the listing one character at a time with a short pause after each write.
listing = u'\n'.join(
    u'%s: %s' % (entity, unichr(name2codepoint[entity]))
    for entity in name2codepoint
)
for c in listing:
    sys.stdout.write(c)
    time.sleep(0.03)
示例#11
0
from writer import write_numbers

if __name__ == "__main__":
    import argparse

    # -t selects the tolerant patterns; -v echoes every line that has a hit.
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', dest='tolerant', action='store_true',
                        default=False, help='tolerant')
    parser.add_argument('-v', dest='verbose', action='store_true',
                        default=False, help='print lines and linenr')
    args = parser.parse_args(sys.argv[1:])

    # One compiled pattern per entity name in name2codepoint.
    if args.tolerant:
        # We are looking for damaged entities, i.e. entities where either the
        # preceding & or the succeeding ; is missing
        template = "(?:[^a-zA-Z])(&%s|%s;)"
        expressions = [re.compile(template % (name, name))
                       for name in name2codepoint.keys()]
    else:
        expressions = [re.compile("&%s;" % name)
                       for name in name2codepoint.keys()]

    # Tally hits keyed by the matched text.  ``search`` yields at most one
    # hit per pattern per line, so repeats on one line count once.
    entity_counts = defaultdict(int)
    for linenr, line in enumerate(sys.stdin):
        line = line.strip()
        for ex in expressions:
            match = ex.search(line)
            if not match:
                continue
            entity_counts[match.group()] += 1
            if args.verbose:
                # Same output as the statement form ``print linenr, line``.
                sys.stdout.write("%s %s\n" % (linenr, line))

    # print entity_counts
示例#12
0
    def codepoint2name(self):
        """Return the inverse of ``name2codepoint`` as a new dict.

        Keys are the code points, values the entity names that map to them.
        """
        return dict(
            (codepoint, name) for name, codepoint in name2codepoint.items()
        )
示例#13
0
from operator import itemgetter

if __name__ == "__main__":
    import argparse

    # Both switches are plain boolean flags that default to off.
    parser = argparse.ArgumentParser()
    for flag, dest, text in (('-t', 'tolerant', 'tolerant'),
                             ('-v', 'verbose', 'print lines and linenr')):
        parser.add_argument(flag, action='store_true', dest=dest,
                            help=text, default=False)
    args = parser.parse_args(sys.argv[1:])

    # Compile one pattern per entity name in name2codepoint.
    expressions = []
    for name in name2codepoint.keys():
        if args.tolerant:
            # We are looking for damaged entities, i.e. entities where either
            # the preceding & or the succeeding ; is missing
            source = "(?:[^a-zA-Z])(&%s|%s;)" % (name, name)
        else:
            source = "&%s;" % (name)
        expressions.append(re.compile(source))

    # Count hits keyed by the matched text; each pattern contributes at most
    # one hit per input line because only the first match is examined.
    entity_counts = defaultdict(int)
    for linenr, line in enumerate(sys.stdin):
        line = line.strip()
        for ex in expressions:
            match = ex.search(line)
            if match is None:
                continue
            entity_counts[match.group()] += 1
            if args.verbose:
                # Same output as the statement form ``print linenr, line``.
                sys.stdout.write("%s %s\n" % (linenr, line))

    # print entity_counts