Example #1
 def build_ahc(self):
     if len(self.anchors) > 6:
         self.logger.warn("More than six anchors in file %r. "
                          "Some links may not work properly." %
                          self.item.href)
     data = StringIO()
     data.write(codepoint_to_chr(len(self.anchors)).encode('utf-8'))
     for anchor, offset in self.anchors:
         data.write(codepoint_to_chr(len(anchor)).encode('utf-8'))
         data.write(anchor)
         data.write(pack('<I', offset))
     return data.getvalue()
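
All the snippets on this page import codepoint_to_chr from calibre's polyglot.builtins compatibility layer. As a rough sketch of what such a shim looks like (an assumption, not the actual calibre source), it is essentially chr on Python 3 and unichr on Python 2:

import sys

if sys.version_info[0] >= 3:
    # chr() already covers the full Unicode range on Python 3
    codepoint_to_chr = chr
else:
    # unichr() returns a unicode object on Python 2
    # (narrow builds are limited to the Basic Multilingual Plane)
    codepoint_to_chr = unichr  # noqa: F821

print(codepoint_to_chr(0x203D))  # prints the interrobang character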
Example #2
 def dump_hex(self, src, length=16):
     ''' Diagnostic '''
     FILTER = ''.join([(len(repr(codepoint_to_chr(x))) == 3) and codepoint_to_chr(x) or '.' for x in range(256)])
     N = 0
     result = ''
     while src:
         s, src = src[:length], src[length:]
         hexa = ' '.join(["%02X" % ord(x) for x in s])
         s = s.translate(FILTER)
         result += "%04X   %-*s   %s\n" % (N, length*3, hexa, s)
         N += length
     print(result)
Example #3
    def get_tweaks_docs(self):
        path = self.a(self.j(self.SRC, '..', 'resources', 'default_tweaks.py'))
        with open(path, 'rb') as f:
            raw = f.read().decode('utf-8')
        msgs = []
        lines = list(raw.splitlines())
        for i, line in enumerate(lines):
            if line.startswith('#:'):
                msgs.append((i, line[2:].strip()))
                j = i
                block = []
                while True:
                    j += 1
                    line = lines[j]
                    if not line.startswith('#'):
                        break
                    block.append(line[1:].strip())
                if block:
                    msgs.append((i+1, '\n'.join(block)))

        ans = []
        for lineno, msg in msgs:
            ans.append('#: %s:%d'%(path, lineno))
            slash = codepoint_to_chr(92)
            msg = msg.replace(slash, slash*2).replace('"', r'\"').replace('\n',
                    r'\n').replace('\r', r'\r').replace('\t', r'\t')
            ans.append('msgid "%s"'%msg)
            ans.append('msgstr ""')
            ans.append('')

        return '\n'.join(ans)
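
Here codepoint_to_chr(92) is simply the backslash character; the replace chain doubles backslashes and escapes quotes and whitespace characters so each tweak comment becomes a valid single-line PO msgid. A small illustration with a hypothetical message (not taken from default_tweaks.py):

from polyglot.builtins import codepoint_to_chr

msg = 'say "hi"\nthen stop'
slash = codepoint_to_chr(92)
escaped = msg.replace(slash, slash*2).replace('"', r'\"').replace('\n', r'\n')
print('msgid "%s"' % escaped)  # msgid "say \"hi\"\nthen stop"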
Example #4
    def get_tweaks_docs(self):
        path = self.a(self.j(self.SRC, '..', 'resources', 'default_tweaks.py'))
        with open(path, 'rb') as f:
            raw = f.read().decode('utf-8')
        msgs = []
        lines = list(raw.splitlines())
        for i, line in enumerate(lines):
            if line.startswith('#:'):
                msgs.append((i, line[2:].strip()))
                j = i
                block = []
                while True:
                    j += 1
                    line = lines[j]
                    if not line.startswith('#'):
                        break
                    block.append(line[1:].strip())
                if block:
                    msgs.append((i+1, '\n'.join(block)))

        ans = []
        for lineno, msg in msgs:
            ans.append('#: %s:%d'%(path, lineno))
            slash = codepoint_to_chr(92)
            msg = msg.replace(slash, slash*2).replace('"', r'\"').replace('\n',
                    r'\n').replace('\r', r'\r').replace('\t', r'\t')
            ans.append('msgid "%s"'%msg)
            ans.append('msgstr ""')
            ans.append('')

        return '\n'.join(ans)
Example #5
def do_map(m, points):
    base = 0xf000
    limit = len(m) + base
    for p in points:
        if base < p < limit:
            yield m[p - base]
        else:
            yield codepoint_to_chr(p)
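
do_map resolves code points that fall inside the Private Use Area block starting at 0xf000 through the supplied map m and passes everything else through unchanged. A minimal sketch with a made-up map:

m = 'abc'  # hypothetical glyph-to-character map
print(list(do_map(m, [0xf001, 0xf002, 0x41])))
# ['b', 'c', 'A'] -- the first two hit the map, 0x41 falls through to codepoint_to_chr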
Example #6
def do_map(m, points):
    base = 0xf000
    limit = len(m) + base
    for p in points:
        if base < p < limit:
            yield m[p - base]
        else:
            yield codepoint_to_chr(p)
Example #7
 def __unicode_process(self, token):
     # change scope in
     if token == r'\{':
         self.__uc_value.append(self.__uc_value[-1])
         # basic error handling
         self.__reini_utf8_counters()
         return token
     # change scope out
     elif token == r'\}':
         self.__uc_value.pop()
         self.__reini_utf8_counters()
         return token
     # add a uc control
     elif token[:3] == r'\uc':
         self.__uc_value[-1] = int(token[3:])
         self.__reini_utf8_counters()
         return token
     # bin data to skip
     elif self.__uc_bin:
         self.__uc_bin = False
         return ''
     # uc char to remove
     elif self.__uc_char:
         # handle \bin tag in case of uc char to skip
         if token[:4] == r'\bin':
             self.__uc_char -= 1
             self.__uc_bin = True
             return ''
         elif token[:1] == "\\":
             self.__uc_char -= 1
             return ''
         else:
             return self.__remove_uc_chars(0, token)
     # go for real \u token
     match_obj = self.__utf_exp.match(token)
     if match_obj is not None:
         self.__reini_utf8_counters()
         # get value and handle negative case
         uni_char = int(match_obj.group(1))
         uni_len = len(match_obj.group(0))
         if uni_char < 0:
             uni_char += 65536
         uni_char = codepoint_to_chr(uni_char).encode(
             'ascii', 'xmlcharrefreplace')
         self.__uc_char = self.__uc_value[-1]
         # there is only a unicode char
         if len(token) <= uni_len:
             return uni_char
         # a unicode char and something else
         # must be after as it is split on \
         # necessary? maybe for \bin?
         elif not self.__uc_char:
             return uni_char + token[uni_len:]
         # if not uc0 and chars
         else:
             return uni_char + self.__remove_uc_chars(uni_len, token)
     # default
     return token
Example #8
 def __unicode_process(self, token):
     # change scope in
     if token == r'\{':
         self.__uc_value.append(self.__uc_value[-1])
         # basic error handling
         self.__reini_utf8_counters()
         return token
     # change scope out
     elif token == r'\}':
         self.__uc_value.pop()
         self.__reini_utf8_counters()
         return token
     # add a uc control
     elif token[:3] == '\\uc':
         self.__uc_value[-1] = int(token[3:])
         self.__reini_utf8_counters()
         return token
     # bin data to skip
     elif self.__uc_bin:
         self.__uc_bin = False
         return ''
     # uc char to remove
     elif self.__uc_char:
         # handle \bin tag in case of uc char to skip
         if token[:4] == r'\bin':
             self.__uc_char -=1
             self.__uc_bin = True
             return ''
         elif token[:1] == "\\" :
             self.__uc_char -=1
             return ''
         else:
             return self.__remove_uc_chars(0, token)
     # go for real \u token
     match_obj = self.__utf_exp.match(token)
     if match_obj is not None:
         self.__reini_utf8_counters()
         # get value and handle negative case
         uni_char = int(match_obj.group(1))
         uni_len = len(match_obj.group(0))
         if uni_char < 0:
             uni_char += 65536
         uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
         self.__uc_char = self.__uc_value[-1]
         # there is only a unicode char
         if len(token) <= uni_len:
             return uni_char
         # a unicode char and something else
         # must be after as it is split on \
         # necessary? maybe for \bin?
         elif not self.__uc_char:
             return uni_char + token[uni_len:]
         # if not uc0 and chars
         else:
             return uni_char + self.__remove_uc_chars(uni_len, token)
     # default
     return token
Example #9
 def write(self, *values):
     for value in values:
         if isinstance(value, numbers.Integral):
             try:
                 value = codepoint_to_chr(value)
             except OverflowError:
                 self.logger.warn('unicode_type overflow for integer:', value)
                 value = u'?'
         self.buf.write(value.encode('utf-8'))
Example #10
 def pdf_serialize(self, stream):
     raw = self.encode('ascii')
     if len(raw) > 126:
         raise ValueError('Name too long: %r'%self)
     raw = bytearray(raw)
     sharp = ord(b'#')
     buf = (
         codepoint_to_chr(x).encode('ascii') if 33 < x < 126 and x != sharp else
         '#{:x}'.format(x).encode('ascii') for x in raw)
     stream.write(b'/'+b''.join(buf))
Example #11
def entityref(c):
    if not UNICODE_SNOB and c in unifiable.keys():
        return unifiable[c]
    else:
        try:
            name2cp(c)
        except KeyError:
            return "&" + c
        else:
            return codepoint_to_chr(name2cp(c))
Example #12
 def pdf_serialize(self, stream):
     raw = self.encode('ascii')
     if len(raw) > 126:
         raise ValueError('Name too long: %r'%self)
     raw = bytearray(raw)
     sharp = ord(b'#')
     buf = (
         codepoint_to_chr(x).encode('ascii') if 33 < x < 126 and x != sharp else
         '#{:x}'.format(x).encode('ascii') for x in raw)
     stream.write(b'/'+b''.join(buf))
Example #13
def charref(name):
    if name[0] in ['x', 'X']:
        c = int(name[1:], 16)
    else:
        c = int(name)

    if not UNICODE_SNOB and c in unifiable_n.keys():
        return unifiable_n[c]
    else:
        return codepoint_to_chr(c)
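
charref receives the body of a numeric character reference (everything between &# and ;) and, unless UNICODE_SNOB is set, prefers the plain-ASCII replacements in unifiable_n. A hedged usage sketch, assuming html2text's usual defaults where 0x2019 (rsquo) unifies to an apostrophe:

charref('x2019')  # -> "'"  (hex reference, unified to ASCII while UNICODE_SNOB is off)
charref('65')     # -> 'A'  (not in unifiable_n, so codepoint_to_chr(65) is returned)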
Example #14
def escape_funcs():
    global escape, unescape
    if escape is None:
        escapem = {('\\' + x):codepoint_to_chr(i+1) for i, x in enumerate('\\${}')}
        escape_pat = re.compile('|'.join(map(re.escape, escapem)))
        escape = lambda x: escape_pat.sub(lambda m: escapem[m.group()], x.replace(r'\\', '\x01'))
        unescapem = {v:k[1] for k, v in iteritems(escapem)}
        unescape_pat = re.compile('|'.join(unescapem))
        unescape = lambda x:unescape_pat.sub(lambda m:unescapem[m.group()], x)
    return escape, unescape
Example #15
def escape_funcs():
    global escape, unescape
    if escape is None:
        escapem = {('\\' + x):codepoint_to_chr(i+1) for i, x in enumerate('\\${}')}
        escape_pat = re.compile('|'.join(map(re.escape, escapem)))
        escape = lambda x: escape_pat.sub(lambda m: escapem[m.group()], x.replace(r'\\', '\x01'))
        unescapem = {v:k[1] for k, v in iteritems(escapem)}
        unescape_pat = re.compile('|'.join(unescapem))
        unescape = lambda x:unescape_pat.sub(lambda m:unescapem[m.group()], x)
    return escape, unescape
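
The pair returned by escape_funcs() temporarily swaps escaped metacharacters (backslash, $, { and }) for control-character placeholders so later parsing does not trip over them, then turns the placeholders back into the bare characters. A sketch of the round trip with a hypothetical input:

escape, unescape = escape_funcs()
protected = escape(r'cost: \$5')   # the escaped '$' becomes chr(2)
print(unescape(protected))         # cost: $5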
Example #16
 def _build_manifest(self):
     states = ['linear', 'nonlinear', 'css', 'images']
     manifest = dict((state, []) for state in states)
     for item in self._oeb.manifest.values():
         if item.spine_position is not None:
             key = 'linear' if item.linear else 'nonlinear'
             manifest[key].append(item)
         elif item.media_type in OEB_STYLES:
             manifest['css'].append(item)
         elif item.media_type in LIT_IMAGES:
             manifest['images'].append(item)
     data = StringIO()
     data.write(pack('<Bc', 1, '\\'))
     offset = 0
     for state in states:
         items = manifest[state]
         items.sort()
         data.write(pack('<I', len(items)))
         for item in items:
             id, media_type = item.id, item.media_type
             if media_type in OEB_DOCS:
                 # Needs to have 'html' in media-type
                 media_type = XHTML_MIME
             elif media_type in OEB_STYLES:
                 media_type = CSS_MIME
             href = urlunquote(item.href)
             item.offset = offset \
                 if state in ('linear', 'nonlinear') else 0
             data.write(pack('<I', item.offset))
             entry = [
                 codepoint_to_chr(len(id)),
                 unicode_type(id),
                 codepoint_to_chr(len(href)),
                 unicode_type(href),
                 codepoint_to_chr(len(media_type)),
                 unicode_type(media_type)
             ]
             for value in entry:
                 data.write(value.encode('utf-8'))
             data.write('\0')
             offset += item.size
     self._add_file('/manifest', data.getvalue())
Example #17
 def fixup(m, rm=rm, rchar=rchar):
     text = m.group(0)
     if text[:2] == "&#":
         # character reference
         try:
             if text[:3] == "&#x":
                 return codepoint_to_chr(int(text[3:-1], 16))
             else:
                 return codepoint_to_chr(int(text[2:-1]))
         except ValueError:
             pass
     else:
         # named entity
         try:
             text = codepoint_to_chr(name2codepoint[text[1:-1]])
         except KeyError:
             pass
     if rm:
         return rchar  # replace by char
     return text  # leave as is
Example #18
 def fixup(m, rm=rm, rchar=rchar):
     text = m.group(0)
     if text[:2] == "&#":
         # character reference
         try:
             if text[:3] == "&#x":
                 return codepoint_to_chr(int(text[3:-1], 16))
             else:
                 return codepoint_to_chr(int(text[2:-1]))
         except ValueError:
             pass
     else:
         # named entity
         try:
             text = codepoint_to_chr(name2codepoint[text[1:-1]])
         except KeyError:
             pass
     if rm:
         return rchar  # replace by char
     return text  # leave as is
Example #19
 def mkitaiji(self, src, dst):
     dic = {}
     for line in open(src, "rb"):
         line = line.decode('utf-8').strip()
         if line.startswith(';;'):  # skip comment
             continue
         if re.match(r"^$",line):
             continue
         pair = re.sub(r'\\u([0-9a-fA-F]{4})', lambda x:codepoint_to_chr(int(x.group(1),16)), line)
         dic[pair[0]] = pair[1]
     from calibre.utils.serialize import msgpack_dumps
     with open(dst, 'wb') as f:
         f.write(msgpack_dumps(dic))
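
Each data line is expected to hold a pair of \uXXXX escapes; the re.sub callback turns every escape into the real character via codepoint_to_chr, so pair[0] and pair[1] end up as the variant and its canonical form. A sketch with a hypothetical input line:

import re
from polyglot.builtins import codepoint_to_chr

line = r'\u4e9c\u4e9d'  # hypothetical variant/canonical pair
pair = re.sub(r'\\u([0-9a-fA-F]{4})', lambda x: codepoint_to_chr(int(x.group(1), 16)), line)
print(len(pair))  # 2 -- two real characters, ready for dic[pair[0]] = pair[1]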
Example #20
 def create_sfnt(self, text_item):
     get_table = partial(self.qt_hack.get_sfnt_table, text_item)
     try:
         ans = Font(Sfnt(get_table))
     except UnsupportedFont as e:
         raise UnsupportedFont('The font %s is not a valid sfnt. Error: %s'%(
             text_item.font().family(), e))
     glyph_map = self.qt_hack.get_glyph_map(text_item)
     gm = {}
     ans.ignore_glyphs = set()
     for uc, glyph_id in enumerate(glyph_map):
         if glyph_id not in gm:
             gm[glyph_id] = codepoint_to_chr(uc)
             if uc in (0xad, 0x200b):
                 ans.ignore_glyphs.add(glyph_id)
     ans.full_glyph_map = gm
     return ans
Example #21
    def __init__(self, metrics, num, objects, compress):
        self.metrics, self.compress = metrics, compress
        self.is_otf = self.metrics.is_otf
        self.subset_tag = str(
            re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
        )).rjust(6, 'A')
        self.font_stream = FontStream(metrics.is_otf, compress=compress)
        try:
            psname = metrics.postscript_name
        except Exception:
            psname = uuid4()
        self.font_descriptor = Dictionary({
            'Type': Name('FontDescriptor'),
            'FontName': Name('%s+%s'%(self.subset_tag, psname)),
            'Flags': 0b100,  # Symbolic font
            'FontBBox': Array(metrics.pdf_bbox),
            'ItalicAngle': metrics.post.italic_angle,
            'Ascent': metrics.pdf_ascent,
            'Descent': metrics.pdf_descent,
            'CapHeight': metrics.pdf_capheight,
            'AvgWidth': metrics.pdf_avg_width,
            'StemV': metrics.pdf_stemv,
        })
        self.descendant_font = Dictionary({
            'Type':Name('Font'),
            'Subtype':Name('CIDFontType' + ('0' if metrics.is_otf else '2')),
            'BaseFont': self.font_descriptor['FontName'],
            'FontDescriptor':objects.add(self.font_descriptor),
            'CIDSystemInfo':Dictionary({
                'Registry':String('Adobe'),
                'Ordering':String('Identity'),
                'Supplement':0,
            }),
        })
        if not self.is_otf:
            self.descendant_font['CIDToGIDMap'] = Name('Identity')

        self.font_dict = Dictionary({
            'Type':Name('Font'),
            'Subtype':Name('Type0'),
            'Encoding':Name('Identity-H'),
            'BaseFont':self.descendant_font['BaseFont'],
            'DescendantFonts':Array([objects.add(self.descendant_font)]),
        })

        self.used_glyphs = set()
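
The subset_tag expression maps each octal digit of the object number onto a letter (0 becomes A, 7 becomes H) and pads to six characters, the conventional prefix for subset font names in PDF. A worked example:

import re
from polyglot.builtins import codepoint_to_chr

num = 12345
digits = oct(num).replace('o', '')   # '030071' on both Python 2 and 3
tag = re.sub('.', lambda m: codepoint_to_chr(int(m.group()) + ord('A')), digits).rjust(6, 'A')
print(tag)  # ADAAHB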
Example #22
 def create_sfnt(self, text_item):
     get_table = partial(self.qt_hack.get_sfnt_table, text_item)
     try:
         ans = Font(Sfnt(get_table))
     except UnsupportedFont as e:
         raise UnsupportedFont(
             'The font %s is not a valid sfnt. Error: %s' %
             (text_item.font().family(), e))
     glyph_map = self.qt_hack.get_glyph_map(text_item)
     gm = {}
     ans.ignore_glyphs = set()
     for uc, glyph_id in enumerate(glyph_map):
         if glyph_id not in gm:
             gm[glyph_id] = codepoint_to_chr(uc)
             if uc in (0xad, 0x200b):
                 ans.ignore_glyphs.add(glyph_id)
     ans.full_glyph_map = gm
     return ans
Example #23
def read_utf8_char(bytes, pos):
    c = ord(bytes[pos:pos+1])
    mask = 0x80
    if (c & mask):
        elsize = 0
        while c & mask:
            mask >>= 1
            elsize += 1
        if (mask <= 1) or (mask == 0x40):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
    else:
        elsize = 1
    if elsize > 1:
        if elsize + pos > len(bytes):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
        c &= (mask - 1)
        for i in range(1, elsize):
            b = ord(bytes[pos+i:pos+i+1])
            if (b & 0xC0) != 0x80:
                raise LitError(
                    'Invalid UTF8 character: %s' % repr(bytes[pos:pos+i]))
            c = (c << 6) | (b & 0x3F)
    return codepoint_to_chr(c), pos+elsize
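
read_utf8_char decodes one UTF-8 encoded character starting at pos and returns the character together with the position just past it, raising LitError on malformed input. For instance:

read_utf8_char(b'\xe2\x9c\x93 ok', 0)  # -> (check mark U+2713, 3): a three-byte sequence
read_utf8_char(b'A', 0)                # -> ('A', 1): plain ASCII advances by one byte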
Example #24
def read_utf8_char(bytes, pos):
    c = ord(bytes[pos])
    mask = 0x80
    if (c & mask):
        elsize = 0
        while c & mask:
            mask >>= 1
            elsize += 1
        if (mask <= 1) or (mask == 0x40):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
    else:
        elsize = 1
    if elsize > 1:
        if elsize + pos > len(bytes):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
        c &= (mask - 1)
        for i in range(1, elsize):
            b = ord(bytes[pos + i])
            if (b & 0xC0) != 0x80:
                raise LitError('Invalid UTF8 character: %s' %
                               repr(bytes[pos:pos + i]))
            c = (c << 6) | (b & 0x3F)
    return codepoint_to_chr(c), pos + elsize
Example #25
    def binary_to_text_inner(self, bin, buf, stack):
        (depth, tag_name, current_map, dynamic_tag, errors,
                in_censorship, is_goingdown, state, flags) = stack.pop()

        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(u''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write(b'<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError(
                                "atom tag %d not in atom tag list" % tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?'+codepoint_to_chr(tag)+'?'
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d'%(tag_name,
                            self.cpos))
                    break

            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(b' />')
                    else:
                        buf.write(b'>')
                        frame = (depth, tag_name, current_map,
                            dynamic_tag, errors, in_censorship, False,
                            'close tag', flags)
                        stack.append(frame)
                        frame = (depth+1, None, None, 0, 0,
                                False, False, 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError(
                            'Unknown attribute %d in tag %s' % (oc, tag_name))
                    if attr.startswith('%'):
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(b' ' + encode(attr) + b'=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            elif state == 'get value length':
                if not in_censorship:
                    buf.write(b'"')
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                if oc == 0xffff:
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write(encode('%s"' % (oc - 1)))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        if isinstance(c, unicode_type):
                            c = c.encode('ascii', 'xmlcharrefreplace')
                        buf.write(c)
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin)-self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(b' ')
                state = 'get custom attr'

            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write(b'=')
                    state = 'get value length'

            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode(u'"%s"' % path))
                    state = 'get attr'
Example #26
 def uni(match):
     try:
         return codepoint_to_chr(int(match.group(1)))
     except Exception:
         return '?'
Example #27
    def binary_to_text_inner(self, bin, buf, stack):
        (depth, tag_name, current_map, dynamic_tag, errors, in_censorship,
         is_goingdown, state, flags) = stack.pop()

        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(u''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write('<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError("atom tag %d not in atom tag list" %
                                           tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?' + codepoint_to_chr(tag) + '?'
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' %
                              codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d' %
                                       (tag_name, self.cpos))
                    break

            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(' />')
                    else:
                        buf.write('>')
                        frame = (depth, tag_name, current_map, dynamic_tag,
                                 errors, in_censorship, False, 'close tag',
                                 flags)
                        stack.append(frame)
                        frame = (depth + 1, None, None, 0, 0, False, False,
                                 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError('Unknown attribute %d in tag %s' %
                                       (oc, tag_name))
                    if attr.startswith('%'):
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(' ' + encode(attr) + '=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            elif state == 'get value length':
                if not in_censorship:
                    buf.write('"')
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write('"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                if oc == 0xffff:
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write('%s"' % (oc - 1))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        buf.write(c.encode('ascii', 'xmlcharrefreplace'))
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write('"')
                    in_censorship = False
                    state = 'get attr'

            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin) - self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(' ')
                state = 'get custom attr'

            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write('=')
                    state = 'get value length'

            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode(u'"%s"' % path))
                    state = 'get attr'
Example #28
import re, socket

from mechanize import URLError

from calibre.ebooks.metadata.book.base import Metadata
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import codepoint_to_chr, unicode_type, range
from polyglot.urllib import parse_qs, quote_plus

URL = \
"http://ww2.kdl.org/libcat/WhatsNext.asp?AuthorLastName={0}&AuthorFirstName=&SeriesName=&BookTitle={1}&CategoryID=0&cmdSearch=Search&Search=1&grouping="

_ignore_starts = u'\'"'+u''.join(codepoint_to_chr(x) for x in list(range(0x2018, 0x201e))+[0x2032, 0x2033])


def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode_type):
        title = title.encode('utf-8')

    title = quote_plus(title)

    author = authors[0].strip()
Example #29
def _replace_unicode(match):
    codepoint = int(match.group(1), 16)
    if codepoint > sys.maxunicode:
        codepoint = 0xFFFD
    return codepoint_to_chr(codepoint)
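
_replace_unicode is a re.sub callback: group(1) holds the hex digits of an escape, and anything beyond sys.maxunicode collapses to U+FFFD. A usage sketch with an assumed escape pattern (the real pattern lives elsewhere in the module):

import re
import sys

escape_pat = re.compile(r'\\([0-9a-fA-F]{1,6})')  # assumed pattern, for illustration only
print(escape_pat.sub(_replace_unicode, r'check \2713 done'))  # check mark appears in place of the escape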
Example #30
class Parser(object):

    def __init__(self):
        self.current_token = 0
        self.tokens = None

    OPCODE = 1
    WORD = 2
    QUOTED_WORD = 3
    EOF = 4
    REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))

    # Had to translate named constants to numeric values
    lex_scanner = re.Scanner([
            (r'[()]', lambda x,t: (Parser.OPCODE, t)),
            (r'@.+?:[^")\s]+', lambda x,t: (Parser.WORD, unicode_type(t))),
            (r'[^"()\s]+', lambda x,t: (Parser.WORD, unicode_type(t))),
            (r'".*?((?<!\\)")', lambda x,t: (Parser.QUOTED_WORD, t[1:-1])),
            (r'\s+',              None)
    ], flags=re.DOTALL)

    def token(self, advance=False):
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return res

    def lcase_token(self, advance=False):
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return icu_lower(res)

    def token_type(self):
        if self.is_eof():
            return self.EOF
        return self.tokens[self.current_token][0]

    def is_eof(self):
        return self.current_token >= len(self.tokens)

    def advance(self):
        self.current_token += 1

    def tokenize(self, expr):
        # Strip out escaped backslashes, quotes and parens so that the
        # lex scanner doesn't get confused. We put them back later.
        for k, v in self.REPLACEMENTS:
            expr = expr.replace(k, v)
        tokens = self.lex_scanner.scan(expr)[0]

        def unescape(x):
            for k, v in self.REPLACEMENTS:
                x = x.replace(v, k[1:])
            return x

        return [
            (tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
            for tt, tv in tokens
        ]

    def parse(self, expr, locations):
        self.locations = locations
        self.tokens = self.tokenize(expr)
        self.current_token = 0
        prog = self.or_expression()
        if not self.is_eof():
            raise ParseException(_('Extra characters at end of search'))
        return prog

    def or_expression(self):
        lhs = self.and_expression()
        if self.lcase_token() == 'or':
            self.advance()
            return ['or', lhs, self.or_expression()]
        return lhs

    def and_expression(self):
        lhs = self.not_expression()
        if self.lcase_token() == 'and':
            self.advance()
            return ['and', lhs, self.and_expression()]

        # Account for the optional 'and'
        if ((self.token_type() in [self.WORD, self.QUOTED_WORD] or self.token() == '(') and self.lcase_token() != 'or'):
            return ['and', lhs, self.and_expression()]
        return lhs

    def not_expression(self):
        if self.lcase_token() == 'not':
            self.advance()
            return ['not', self.not_expression()]
        return self.location_expression()

    def location_expression(self):
        if self.token_type() == self.OPCODE and self.token() == '(':
            self.advance()
            res = self.or_expression()
            if self.token_type() != self.OPCODE or self.token(advance=True) != ')':
                raise ParseException(_('missing )'))
            return res
        if self.token_type() not in (self.WORD, self.QUOTED_WORD):
            raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))

        return self.base_token()

    def base_token(self):
        if self.token_type() == self.QUOTED_WORD:
            return ['token', 'all', self.token(advance=True)]

        words = self.token(advance=True).split(':')

        # The complexity here comes from having colon-separated search
        # values. That forces us to check that the first "word" in a colon-
        # separated group is a valid location. If not, then the token must
        # be reconstructed. We also have the problem that locations can be
        # followed by quoted strings that appear as the next token. and that
        # tokens can be a sequence of colons.

        # We have a location if there is more than one word and the first
        # word is in locations. This check could produce a "wrong" answer if
        # the search string is something like 'author: "foo"' because it
        # will be interpreted as 'author:"foo"'. I am choosing to accept the
        # possible error. The expression should be written '"author:" foo'
        if len(words) > 1 and words[0].lower() in self.locations:
            loc = words[0].lower()
            words = words[1:]
            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
                return ['token', loc, self.token(advance=True)]
            return ['token', icu_lower(loc), ':'.join(words)]

        return ['token', 'all', ':'.join(words)]
Example #31
 def tree_to_binary(self,
                    elem,
                    nsrmap=NSRMAP,
                    parents=[],
                    inhead=False,
                    preserve=False):
     if not isinstance(elem.tag, string_or_bytes):
         # Don't emit any comments or raw entities
         return
     nsrmap = copy.copy(nsrmap)
     attrib = dict(elem.attrib)
     style = self.stylizer.style(elem) if self.stylizer else None
     for key, value in elem.nsmap.items():
         if value not in nsrmap or nsrmap[value] != key:
             xmlns = ('xmlns:' + key) if key else 'xmlns'
             attrib[xmlns] = value
         nsrmap[value] = key
     tag = prefixname(elem.tag, nsrmap)
     tag_offset = self.buf.tell()
     if tag == 'head':
         inhead = True
     flags = FLAG_OPENING
     if not elem.text and len(elem) == 0:
         flags |= FLAG_CLOSING
     if inhead:
         flags |= FLAG_HEAD
     if style and self.is_block(style):
         flags |= FLAG_BLOCK
     self.write(0, flags)
     tattrs = self.tattrs[0]
     if tag in self.tags:
         index = self.tags[tag]
         self.write(index)
         if self.tattrs[index]:
             tattrs = self.tattrs[index]
     else:
         self.write(FLAG_CUSTOM, len(tag) + 1, tag)
     last_break = self.page_breaks[-1][0] if self.page_breaks else None
     if style and last_break != tag_offset \
        and style['page-break-before'] in PAGE_BREAKS:
         self.page_breaks.append((tag_offset, list(parents)))
     for attr, value in attrib.items():
         attr = prefixname(attr, nsrmap)
         if attr in ('href', 'src'):
             value = urlnormalize(value)
             path, frag = urldefrag(value)
             if self.item:
                 path = self.item.abshref(path)
             prefix = codepoint_to_chr(3)
             if path in self.manifest.hrefs:
                 prefix = codepoint_to_chr(2)
                 value = self.manifest.hrefs[path].id
                 if frag:
                     value = '#'.join((value, frag))
             value = prefix + value
         elif attr in ('id', 'name'):
             self.anchors.append((value, tag_offset))
         elif attr.startswith('ms--'):
             attr = '%' + attr[4:]
         elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
             value = CSS_MIME
         if attr in tattrs:
             self.write(tattrs[attr])
         else:
             self.write(FLAG_CUSTOM, len(attr) + 1, attr)
         try:
             self.write(ATTR_NUMBER, int(value) + 1)
         except ValueError:
             self.write(len(value) + 1, value)
     self.write(0)
     old_preserve = preserve
     if style:
         preserve = (style['white-space'] in ('pre', 'pre-wrap'))
     xml_space = elem.get(XML('space'))
     if xml_space == 'preserve':
         preserve = True
     elif xml_space == 'normal':
         preserve = False
     if elem.text:
         if preserve:
             self.write(elem.text)
         elif len(elem) == 0 or not elem.text.isspace():
             self.write(COLLAPSE.sub(' ', elem.text))
         # else: de nada
     parents.append(tag_offset)
     child = cstyle = nstyle = None
     for next in chain(elem, [None]):
         if self.stylizer:
             nstyle = None if next is None else self.stylizer.style(next)
         if child is not None:
             if not preserve \
                and (inhead or not nstyle or self.is_block(cstyle) or self.is_block(nstyle)) \
                and child.tail and child.tail.isspace():
                 child.tail = None
             self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
         child, cstyle = next, nstyle
     parents.pop()
     preserve = old_preserve
     if not flags & FLAG_CLOSING:
         self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
     if elem.tail and tag != 'html':
         tail = elem.tail
         if not preserve:
             tail = COLLAPSE.sub(' ', tail)
         self.write(tail)
     if style and style['page-break-after'] not in ('avoid', 'auto'):
         self.page_breaks.append((self.buf.tell(), list(parents)))
Example #32
        ans = None  # invalid tweak value
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except:
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)'%ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except:
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans


_ignore_starts = '\'"'+''.join(codepoint_to_chr(x) for x in
        list(range(0x2018, 0x201e))+[0x2032, 0x2033])


def title_sort(title, order=None, lang=None):
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if match:
        try:
            prep = match.group(1)
Example #33
        ans = None  # invalid tweak value
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except:
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)'%ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except:
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans


_ignore_starts = u'\'"'+u''.join(codepoint_to_chr(x) for x in
        list(range(0x2018, 0x201e))+[0x2032, 0x2033])


def title_sort(title, order=None, lang=None):
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if match:
        try:
            prep = match.group(1)
Example #34
import re, urllib, urlparse, socket

from mechanize import URLError

from calibre.ebooks.metadata.book.base import Metadata
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import codepoint_to_chr, unicode_type

URL = \
"http://ww2.kdl.org/libcat/WhatsNext.asp?AuthorLastName={0}&AuthorFirstName=&SeriesName=&BookTitle={1}&CategoryID=0&cmdSearch=Search&Search=1&grouping="

_ignore_starts = u'\'"' + u''.join(
    codepoint_to_chr(x) for x in range(0x2018, 0x201e) + [0x2032, 0x2033])


def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode_type):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    author = authors[0].strip()
Example #35
 def uni(match):
     try:
         return codepoint_to_chr(int(match.group(1)))
     except Exception:
         return '?'
Example #36
    def process_phtml(self, d, paragraph_offsets=[]):
        html = u'<p id="p0">'
        offset = 0
        paragraph_open = True
        link_open = False
        need_set_p_id = False
        p_num = 1
        font_specifier_close = ''

        while offset < len(d):
            if not paragraph_open:
                if need_set_p_id:
                    html += u'<p id="p%s">' % p_num
                    p_num += 1
                    need_set_p_id = False
                else:
                    html += u'<p>'
                paragraph_open = True

            c = ord(d[offset])
            # PHTML "functions"
            if c == 0x0:
                offset += 1
                c = ord(d[offset])
                # Page link begins
                # 2 Bytes
                # record ID
                if c == 0x0a:
                    offset += 1
                    id = struct.unpack('>H', d[offset:offset + 2])[0]
                    if id in self.uid_text_secion_number:
                        html += '<a href="%s.html">' % id
                        link_open = True
                    offset += 1
                # Targeted page link begins
                # 3 Bytes
                # record ID, target
                elif c == 0x0b:
                    offset += 3
                # Paragraph link begins
                # 4 Bytes
                # record ID, paragraph number
                elif c == 0x0c:
                    offset += 1
                    id = struct.unpack('>H', d[offset:offset + 2])[0]
                    offset += 2
                    pid = struct.unpack('>H', d[offset:offset + 2])[0]
                    if id in self.uid_text_secion_number:
                        html += '<a href="%s.html#p%s">' % (id, pid)
                        link_open = True
                    offset += 1
                # Targeted paragraph link begins
                # 5 Bytes
                # record ID, paragraph number, target
                elif c == 0x0d:
                    offset += 5
                # Link ends
                # 0 Bytes
                elif c == 0x08:
                    if link_open:
                        html += '</a>'
                        link_open = False
                # Set font
                # 1 Bytes
                # font specifier
                elif c == 0x11:
                    offset += 1
                    specifier = d[offset]
                    html += font_specifier_close
                    # Regular text
                    if specifier == 0:
                        font_specifier_close = ''
                    # h1
                    elif specifier == 1:
                        html += '<h1>'
                        font_specifier_close = '</h1>'
                    # h2
                    elif specifier == 2:
                        html += '<h2>'
                        font_specifier_close = '</h2>'
                    # h3
                    elif specifier == 3:
                        html += '<h3>'
                        font_specifier_close = '</h3>'
                    # h4
                    elif specifier == 4:
                        html += '<h4>'
                        font_specifier_close = '</h4>'
                    # h5
                    elif specifier == 5:
                        html += '<h5>'
                        font_specifier_close = '</h5>'
                    # h6
                    elif specifier == 6:
                        html += '<h6>'
                        font_specifier_close = '</h6>'
                    # Bold
                    elif specifier == 7:
                        html += '<b>'
                        font_specifier_close = '</b>'
                    # Fixed-width
                    elif specifier == 8:
                        html += '<tt>'
                        font_specifier_close = '</tt>'
                    # Small
                    elif specifier == 9:
                        html += '<small>'
                        font_specifier_close = '</small>'
                    # Subscript
                    elif specifier == 10:
                        html += '<sub>'
                        font_specifier_close = '</sub>'
                    # Superscript
                    elif specifier == 11:
                        html += '<sup>'
                        font_specifier_close = '</sup>'
                # Embedded image
                # 2 Bytes
                # image record ID
                elif c == 0x1a:
                    offset += 1
                    uid = struct.unpack('>H', d[offset:offset + 2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Set margin
                # 2 Bytes
                # left margin, right margin
                elif c == 0x22:
                    offset += 2
                # Alignment of text
                # 1 Bytes
                # alignment
                elif c == 0x29:
                    offset += 1
                # Horizontal rule
                # 3 Bytes
                # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
                elif c == 0x33:
                    offset += 3
                    if paragraph_open:
                        html += u'</p>'
                        paragraph_open = False
                    html += u'<hr />'
                # New line
                # 0 Bytes
                elif c == 0x38:
                    if paragraph_open:
                        html += u'</p>\n'
                        paragraph_open = False
                # Italic text begins
                # 0 Bytes
                elif c == 0x40:
                    html += u'<i>'
                # Italic text ends
                # 0 Bytes
                elif c == 0x48:
                    html += u'</i>'
                # Set text color
                # 3 Bytes
                # 8-bit red, 8-bit green, 8-bit blue
                elif c == 0x53:
                    offset += 3
                # Multiple embedded image
                # 4 Bytes
                # alternate image record ID, image record ID
                elif c == 0x5c:
                    offset += 3
                    uid = struct.unpack('>H', d[offset:offset + 2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Underline text begins
                # 0 Bytes
                elif c == 0x60:
                    html += u'<u>'
                # Underline text ends
                # 0 Bytes
                elif c == 0x68:
                    html += u'</u>'
                # Strike-through text begins
                # 0 Bytes
                elif c == 0x70:
                    html += u'<s>'
                # Strike-through text ends
                # 0 Bytes
                elif c == 0x78:
                    html += u'</s>'
                # 16-bit Unicode character
                # 3 Bytes
                # alternate text length, 16-bit unicode character
                elif c == 0x83:
                    offset += 3
                # 32-bit Unicode character
                # 5 Bytes
                # alternate text length, 32-bit unicode character
                elif c == 0x85:
                    offset += 5
                # Begin custom font span
                # 6 Bytes
                # font page record ID, X page position, Y page position
                elif c == 0x8e:
                    offset += 6
                # Adjust custom font glyph position
                # 4 Bytes
                # X page position, Y page position
                elif c == 0x8c:
                    offset += 4
                # Change font page
                # 2 Bytes
                # font record ID
                elif c == 0x8a:
                    offset += 2
                # End custom font span
                # 0 Bytes
                elif c == 0x88:
                    pass
                # Begin new table row
                # 0 Bytes
                elif c == 0x90:
                    pass
                # Insert table (or table link)
                # 2 Bytes
                # table record ID
                elif c == 0x92:
                    offset += 2
                # Table cell data
                # 7 Bytes
                # 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length
                elif c == 0x97:
                    offset += 7
                # Exact link modifier
                # 2 Bytes
                # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or
                # Targeted Paragraph Link function to specify an exact byte offset within
                # the paragraph. This function must be followed immediately by the
                # function it modifies).
                elif c == 0x9a:
                    offset += 2
            elif c == 0xa0:
                html += '&nbsp;'
            else:
                html += codepoint_to_chr(c)
            offset += 1
            if offset in paragraph_offsets:
                need_set_p_id = True
                if paragraph_open:
                    html += u'</p>\n'
                    paragraph_open = False

        if paragraph_open:
            html += u'</p>'

        return html
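
Most of the branches in the decode loop above exist only to skip a fixed number of argument bytes for PHTML function codes that the converter does not render. As a reading aid, here is a hypothetical restatement of that bookkeeping as a lookup table; the byte counts are taken from the comments in the code, and the helper name is invented for illustration.

# Hypothetical sketch: argument byte counts for the PHTML function codes that
# the decode loop above merely skips over (counts taken from its comments).
PHTML_SKIP_BYTES = {
    0x0b: 3,  # targeted page link begins
    0x0d: 5,  # targeted paragraph link begins
    0x22: 2,  # set margin
    0x29: 1,  # alignment of text
    0x53: 3,  # set text color
    0x83: 3,  # 16-bit Unicode character
    0x85: 5,  # 32-bit Unicode character
    0x8a: 2,  # change font page
    0x8c: 4,  # adjust custom font glyph position
    0x8e: 6,  # begin custom font span
    0x92: 2,  # insert table (or table link)
    0x97: 7,  # table cell data
    0x9a: 2,  # exact link modifier
}

def skip_phtml_function(d, offset):
    # offset points at the function code byte that follows the 0x00 marker;
    # return the offset of the first byte after the code and its arguments.
    code = ord(d[offset])
    return offset + 1 + PHTML_SKIP_BYTES.get(code, 0)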
Example #37
0
    def process_phtml(self, d, paragraph_offsets=[]):
        html = u'<p id="p0">'
        offset = 0
        paragraph_open = True
        link_open = False
        need_set_p_id = False
        p_num = 1
        font_specifier_close = ''

        while offset < len(d):
            if not paragraph_open:
                if need_set_p_id:
                    html += u'<p id="p%s">' % p_num
                    p_num += 1
                    need_set_p_id = False
                else:
                    html += u'<p>'
                paragraph_open = True

            c = ord(d[offset])
            # PHTML "functions"
            if c == 0x0:
                offset += 1
                c = ord(d[offset])
                # Page link begins
                # 2 Bytes
                # record ID
                if c == 0x0a:
                    offset += 1
                    id = struct.unpack('>H', d[offset:offset+2])[0]
                    if id in self.uid_text_secion_number:
                        html += '<a href="%s.html">' % id
                        link_open = True
                    offset += 1
                # Targeted page link begins
                # 3 Bytes
                # record ID, target
                elif c == 0x0b:
                    offset += 3
                # Paragraph link begins
                # 4 Bytes
                # record ID, paragraph number
                elif c == 0x0c:
                    offset += 1
                    id = struct.unpack('>H', d[offset:offset+2])[0]
                    offset += 2
                    pid = struct.unpack('>H', d[offset:offset+2])[0]
                    if id in self.uid_text_secion_number:
                        html += '<a href="%s.html#p%s">' % (id, pid)
                        link_open = True
                    offset += 1
                # Targeted paragraph link begins
                # 5 Bytes
                # record ID, paragraph number, target
                elif c == 0x0d:
                    offset += 5
                # Link ends
                # 0 Bytes
                elif c == 0x08:
                    if link_open:
                        html += '</a>'
                        link_open = False
                # Set font
                # 1 Bytes
                # font specifier
                elif c == 0x11:
                    offset += 1
                    specifier = d[offset]
                    html += font_specifier_close
                    # Regular text
                    if specifier == 0:
                        font_specifier_close = ''
                    # h1
                    elif specifier == 1:
                        html += '<h1>'
                        font_specifier_close = '</h1>'
                    # h2
                    elif specifier == 2:
                        html += '<h2>'
                        font_specifier_close = '</h2>'
                    # h3
                    elif specifier == 3:
                        html += '<h3>'
                        font_specifier_close = '</h3>'
                    # h4
                    elif specifier == 4:
                        html += '<h4>'
                        font_specifier_close = '</h4>'
                    # h5
                    elif specifier == 5:
                        html += '<h5>'
                        font_specifier_close = '</h5>'
                    # h6
                    elif specifier == 6:
                        html += '<h6>'
                        font_specifier_close = '</h6>'
                    # Bold
                    elif specifier == 7:
                        html += '<b>'
                        font_specifier_close = '</b>'
                    # Fixed-width
                    elif specifier == 8:
                        html += '<tt>'
                        font_specifier_close = '</tt>'
                    # Small
                    elif specifier == 9:
                        html += '<small>'
                        font_specifier_close = '</small>'
                    # Subscript
                    elif specifier == 10:
                        html += '<sub>'
                        font_specifier_close = '</sub>'
                    # Superscript
                    elif specifier == 11:
                        html += '<sup>'
                        font_specifier_close = '</sup>'
                # Embedded image
                # 2 Bytes
                # image record ID
                elif c == 0x1a:
                    offset += 1
                    uid = struct.unpack('>H', d[offset:offset+2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Set margin
                # 2 Bytes
                # left margin, right margin
                elif c == 0x22:
                    offset += 2
                # Alignment of text
                # 1 Bytes
                # alignment
                elif c == 0x29:
                    offset += 1
                # Horizontal rule
                # 3 Bytes
                # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
                elif c == 0x33:
                    offset += 3
                    if paragraph_open:
                        html += u'</p>'
                        paragraph_open = False
                    html += u'<hr />'
                # New line
                # 0 Bytes
                elif c == 0x38:
                    if paragraph_open:
                        html += u'</p>\n'
                        paragraph_open = False
                # Italic text begins
                # 0 Bytes
                elif c == 0x40:
                    html += u'<i>'
                # Italic text ends
                # 0 Bytes
                elif c == 0x48:
                    html += u'</i>'
                # Set text color
                # 3 Bytes
                # 8-bit red, 8-bit green, 8-bit blue
                elif c == 0x53:
                    offset += 3
                # Multiple embedded image
                # 4 Bytes
                # alternate image record ID, image record ID
                elif c == 0x5c:
                    offset += 3
                    uid = struct.unpack('>H', d[offset:offset+2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Underline text begins
                # 0 Bytes
                elif c == 0x60:
                    html += u'<u>'
                # Underline text ends
                # 0 Bytes
                elif c == 0x68:
                    html += u'</u>'
                # Strike-through text begins
                # 0 Bytes
                elif c == 0x70:
                    html += u'<s>'
                # Strike-through text ends
                # 0 Bytes
                elif c == 0x78:
                    html += u'</s>'
                # 16-bit Unicode character
                # 3 Bytes
                # alternate text length, 16-bit unicode character
                elif c == 0x83:
                    offset += 3
                # 32-bit Unicode character
                # 5 Bytes
                # alternate text length, 32-bit unicode character
                elif c == 0x85:
                    offset += 5
                # Begin custom font span
                # 6 Bytes
                # font page record ID, X page position, Y page position
                elif c == 0x8e:
                    offset += 6
                # Adjust custom font glyph position
                # 4 Bytes
                # X page position, Y page position
                elif c == 0x8c:
                    offset += 4
                # Change font page
                # 2 Bytes
                # font record ID
                elif c == 0x8a:
                    offset += 2
                # End custom font span
                # 0 Bytes
                elif c == 0x88:
                    pass
                # Begin new table row
                # 0 Bytes
                elif c == 0x90:
                    pass
                # Insert table (or table link)
                # 2 Bytes
                # table record ID
                elif c == 0x92:
                    offset += 2
                # Table cell data
                # 7 Bytes
                # 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length
                elif c == 0x97:
                    offset += 7
                # Exact link modifier
                # 2 Bytes
                # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or
                # Targeted Paragraph Link function to specify an exact byte offset within
                # the paragraph. This function must be followed immediately by the
                # function it modifies).
                elif c == 0x9a:
                    offset += 2
            elif c == 0xa0:
                html += '&nbsp;'
            else:
                html += codepoint_to_chr(c)
            offset += 1
            if offset in paragraph_offsets:
                need_set_p_id = True
                if paragraph_open:
                    html += u'</p>\n'
                    paragraph_open = False

        if paragraph_open:
            html += u'</p>'

        return html
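
A minimal driving sketch, assuming the method above has been lifted to module scope (only the method body is shown here, without its enclosing reader class) and that codepoint_to_chr is bound to chr, which is what calibre's polyglot.builtins provides on Python 3. The function indexes d with ord(), Python 2 style, so an ordinary str works for this ASCII-plus-\xa0 sample, and the sample sticks to codes that need neither struct unpacking nor record lookups.

from types import SimpleNamespace

codepoint_to_chr = chr  # stand-in for calibre's polyglot.builtins alias

# Stand-in reader object: uid_text_secion_number is only consulted for link
# codes, which this sample avoids.
reader = SimpleNamespace(uid_text_secion_number={})

# \x00\x40 = italic begins, \x00\x48 = italic ends, \xa0 = non-breaking space
sample = '\x00\x40Hello\x00\x48\xa0world'

print(process_phtml(reader, sample))
# <p id="p0"><i>Hello</i>&nbsp;world</p>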
Example #38
0
 def uni(match):
     return codepoint_to_chr(int(match.group(1)))
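
Presumably this callback is fed to re.sub to expand decimal numeric character references; a self-contained sketch of that pattern follows (the regex and sample text are illustrative, not from the source), with chr standing in for codepoint_to_chr.

import re

codepoint_to_chr = chr  # stand-in for calibre's polyglot.builtins alias

def uni(match):
    return codepoint_to_chr(int(match.group(1)))

# Expand references such as &#955; (GREEK SMALL LETTER LAMDA)
print(re.sub(r'&#(\d+);', uni, 'lambda: &#955;'))  # -> lambda: λ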
Example #39
0
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except:
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)' % ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except:
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans


_ignore_starts = u'\'"' + u''.join(
    codepoint_to_chr(x)
    for x in list(range(0x2018, 0x201e)) + [0x2032, 0x2033])


def title_sort(title, order=None, lang=None):
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if match:
        try:
            prep = match.group(1)
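
The codepoints used to build _ignore_starts above are the curly single and double quotes (U+2018 through U+201D) plus the prime marks (U+2032, U+2033); title_sort then drops one such leading character before testing the article pattern. A small sketch that prints the resulting set, with chr standing in for codepoint_to_chr:

codepoint_to_chr = chr  # stand-in for calibre's polyglot.builtins alias

ignore_starts = '\'"' + ''.join(
    codepoint_to_chr(x)
    for x in list(range(0x2018, 0x201e)) + [0x2032, 0x2033])

print(ignore_starts)  # '"‘’‚‛“”′″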
Example #40
0
 def unescape_entity(m):
     try:
         return codepoint_to_chr(name2codepoint[m.group(1)])
     except KeyError:
         return m.group(0)  # use as is
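
A plausible self-contained sketch of wiring this callback up with re.sub and the standard library's name2codepoint table; the regex and sample string are illustrative, not taken from the source.

import re
from html.entities import name2codepoint

codepoint_to_chr = chr  # stand-in for calibre's polyglot.builtins alias

def unescape_entity(m):
    try:
        return codepoint_to_chr(name2codepoint[m.group(1)])
    except KeyError:
        return m.group(0)  # unknown entity: leave it as is

print(re.sub(r'&(\w+);', unescape_entity, 'Tom &amp; Jerry &copy; 2024 &bogus;'))
# -> Tom & Jerry © 2024 &bogus;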