def prepare_pml(self, pml):
    r'''
    Pre-process raw PML text before it is parsed.

    The following steps are applied in order:

    1. Chapter markers (\x and \X0 .. \X4) are rewritten to the form
       \x="title"text\x, where title is the marker's text run through
       self.strip_pml. This is used for generating the TOC later.
    2. Comments (\v ... \v) are removed.
    3. Runs of spaces are collapsed and leading/trailing spaces on each
       line are dropped.
    4. <footnote id="..."> and <sidebar id="..."> elements are rewritten
       as \FN="id"text\FN and \SB="id"text\SB; elements whose text is
       empty are removed.
    5. Literal ampersands are escaped as &amp;.
    6. \aNNN becomes the numeric entity &#NNN; and \Uxxxx (four lowercase
       hex digits) becomes the corresponding character.
    7. The result is passed through prepare_string_for_xml.

    :param pml: The PML source text.
    :return: The value returned by prepare_string_for_xml for the
        processed text.
    '''
    def tag_chapter(match):
        # Re-emit the chapter as <code>="<stripped title>"<original text><code>.
        code = match.group('c')
        text = match.group('text')
        return '%s="%s"%s%s' % (code, self.strip_pml(text), text, code)

    def note_rewriter(code):
        # Build a replacement callback producing \<code>="target"text\<code>,
        # or nothing at all when the element has no text.
        def rewrite(match):
            text = match.group('text')
            if not text:
                return ''
            return '\\%s="%s"%s\\%s' % (code, match.group('target'), text, code)
        return rewrite

    # Give chapters the form \x="text"text\x. This is used for generating
    # the TOC later.
    pml = re.sub(r'(?msu)(?P<c>\\x)(?P<text>.*?)(?P=c)', tag_chapter, pml)
    pml = re.sub(r'(?msu)(?P<c>\\X[0-4])(?P<text>.*?)(?P=c)', tag_chapter, pml)

    # Remove comments.
    pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml)

    # Remove extra white space: collapse runs of spaces, then trim the
    # start and end of every line and blank out space-only lines.
    pml = re.sub(r'(?mus)[ ]{2,}', ' ', pml)
    pml = re.sub(r'(?mus)^[ ]*(?=.)', '', pml)
    pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml)
    pml = re.sub(r'(?mus)^[ ]*$', '', pml)

    # Footnotes and sidebars.
    pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', note_rewriter('FN'), pml)
    pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', note_rewriter('SB'), pml)

    # Convert &'s into entities so that a literal "&amp;" in the text does
    # not get turned into "&"; it will display as "&amp;". The replacement
    # has to be the entity itself: replacing '&' with '&' does nothing.
    # This must run before the \a handling below so the ampersands of the
    # numeric entities generated there are left alone by this step.
    pml = pml.replace('&', '&amp;')

    # Replace \a and \U with either the entity or the unicode character.
    pml = re.sub(r'\\a(?P<num>\d{3})', r'&#\g<num>;', pml)
    pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: my_unichr(int(match.group('num'), 16)), pml)

    # NOTE(review): prepare_string_for_xml is defined elsewhere; presumably
    # it makes the text safe for embedding in XML -- confirm how it treats
    # the entities produced above.
    return prepare_string_for_xml(pml)
def tostring(raw, **kwargs):
    '''
    Serialize ``raw`` with ``etree.tostring`` and return encoded bytes in
    which every hex character reference has been replaced by the character
    itself.

    lxml *sometimes* represents non-ascii characters as hex entities in
    attribute values. It is unclear exactly what triggers this; it seems to
    happen when serializing a part of a larger tree. Because full and
    partial trees must serialize identically, all hex entities are manually
    replaced with their unicode codepoints.

    Keyword arguments handled here (anything else is forwarded unchanged to
    ``etree.tostring``):

    :param xml_declaration: When true, an XML declaration naming
        ``encoding`` is prepended to the output. Defaults to False.
    :param encoding: Encoding of the returned bytes and the name written in
        the declaration. Defaults to 'UTF-8'.
    :return: The serialized document, encoded with ``encoding``.
    '''
    want_declaration = kwargs.pop('xml_declaration', False)
    target_encoding = kwargs.pop('encoding', 'UTF-8')

    # Always serialize to text with no declaration; the declaration and the
    # final encoding are applied by hand below.
    text = etree.tostring(raw, encoding=unicode_type, xml_declaration=False, **kwargs)

    if want_declaration:
        text = '<?xml version="1.0" encoding="%s"?>\n' % target_encoding + text

    def hex_entity_to_char(match):
        # &#xNNNN; -> the character with that code point.
        return my_unichr(int(match.group(1), 16))

    decoded = re.sub(r'&#x([0-9A-Fa-f]+);', hex_entity_to_char, text)
    return decoded.encode(target_encoding)