def tokens(self, text):
    """Tokenize an XHTML document into command and block-code tokens.

    The document is run through ``sax.parseString`` with an
    ``XHTMLSaxHandler`` (built from ``self.styles`` / ``self.tags``) and an
    ``XHTMLSaxErrorHandler`` (built from ``self`` / ``self.ignore``).  The
    text pieces collected by the handler are joined and scanned for:

    * command definitions, via ``core.Cmd.syntax.findtokens('cmddef', ...)``;
    * block code enclosed in ``@start_block@`` ... ``@end_block@``.

    Inline-code markers (``@start_inline@`` ... ``@end_inline@``) are not
    tokenized by this method.

    :param text: XHTML source; a ``str`` is converted with
        ``core.strtobytes23`` before parsing, anything else is passed as is.
    :return: list of tokens sorted by their ``start`` attribute, terminated
        by ``core.EndToken(None)``.
    """
    handler = XHTMLSaxHandler(styles=self.styles, tags=self.tags)
    err_handler = XHTMLSaxErrorHandler(self, self.ignore)

    # The parser is fed bytes: a str input is converted first (the original
    # author flagged this as a suspected Python quirk).
    raw = core.strtobytes23(text) if type(text) is str else text
    sax.parseString(raw, handler, err_handler)
    plain = ''.join(handler.text)

    # DOTALL lets a block span several lines of the extracted text.
    block_re = re.compile(r'@start_block@(.*?)@end_block@',
                          re.DOTALL | re.MULTILINE)

    found = [
        core.CmdToken(XHTMLSaxHandler._decode_spec_chars(hit.group(1)),
                      hit.start(0), hit.end(0))
        for hit in core.Cmd.syntax.findtokens('cmddef', plain)
    ]

    for hit in block_re.finditer(plain):
        # Drop leading newlines and trailing newlines/spaces, then remove
        # the indentation with core.deltextindent.
        body = core.deltextindent(hit.group(1).lstrip('\n').rstrip('\n '))
        found.append(core.BlkCodeToken(
            XHTMLSaxHandler._decode_spec_chars(body),
            hit.start(0), hit.end(0)))

    # Stable sort: tokens with equal start keep commands-before-blocks order.
    found.sort(key=lambda item: item.start)
    found.append(core.EndToken(None))
    return found
def tokens(self, text):
    """Tokenize an OpenOffice XML document into command and code tokens.

    The document is run through ``sax.parseString`` with an
    ``OOSaxHandler`` built from ``self.style``.  The text pieces collected
    by the handler are joined and scanned for:

    * command definitions, via ``core.Cmd.syntax.findtokens('cmddef', ...)``;
    * inline code enclosed in ``@start_inline@`` ... ``@end_inline@``;
    * block code enclosed in ``@start_block@`` ... ``@end_block@``.

    :param text: document source; a ``str`` is converted with
        ``core.strtobytes23`` before parsing, anything else is passed as is.
    :return: list of tokens sorted by their ``start`` attribute, terminated
        by ``core.EndToken(None)``.
    """
    handler = OOSaxHandler(style=self.style)

    # The parser is fed bytes: a str input is converted first (the original
    # author flagged this as a suspected Python quirk).
    raw = core.strtobytes23(text) if type(text) is str else text
    sax.parseString(raw, handler)
    plain = ''.join(handler.text)

    # NOTE(review): neither pattern uses re.DOTALL, so a marker pair that is
    # split across lines of the extracted text is not matched -- confirm
    # this is intended for OO documents.
    inline_re = re.compile(r'@start_inline@(.*?)@end_inline@', re.MULTILINE)
    block_re = re.compile(r'@start_block@(.*?)@end_block@', re.MULTILINE)

    found = [
        core.CmdToken(OOSaxHandler._decode_spec_chars(hit.group(1)),
                      hit.start(0), hit.end(0))
        for hit in core.Cmd.syntax.findtokens('cmddef', plain)
    ]

    for hit in inline_re.finditer(plain):
        # XXX (kept from the original): a line break in OO is coded with
        # paragraph styling, so this linearizing does not help.
        flat = core.InlCodeToken.linearize(
            OOSaxHandler._decode_spec_chars(hit.group(1)))
        found.append(core.InlCodeToken(flat, hit.start(0), hit.end(0)))

    found.extend(
        core.BlkCodeToken(OOSaxHandler._decode_spec_chars(hit.group(1)),
                          hit.start(0), hit.end(0))
        for hit in block_re.finditer(plain)
    )

    # Stable sort: tokens with equal start keep their insertion order.
    found.sort(key=lambda item: item.start)
    found.append(core.EndToken(None))
    return found
def tokens(self, text):
    """Return the position-ordered token list for an XHTML document.

    Steps:

    1. Parse ``text`` with ``sax.parseString`` using an ``XHTMLSaxHandler``
       (configured from ``self.styles`` and ``self.tags``) and an
       ``XHTMLSaxErrorHandler`` (configured from ``self`` and
       ``self.ignore``).
    2. Join the handler's collected ``text`` pieces into one string.
    3. Create a ``core.CmdToken`` for every ``'cmddef'`` match reported by
       ``core.Cmd.syntax.findtokens`` and a ``core.BlkCodeToken`` for every
       ``@start_block@`` ... ``@end_block@`` region.
    4. Sort by ``start`` and append ``core.EndToken(None)``.

    Inline-code markers (``@start_inline@`` ... ``@end_inline@``) are not
    tokenized by this method.

    :param text: XHTML source; converted with ``core.strtobytes23`` when it
        is exactly of type ``str``, otherwise used unchanged.
    :return: list of token objects ending with the end token.
    """
    content_handler = XHTMLSaxHandler(styles=self.styles, tags=self.tags)
    error_handler = XHTMLSaxErrorHandler(self, self.ignore)

    # The original author noted that the parser needs bytes rather than str
    # here, hence the conversion.
    payload = text
    if type(text) is str:
        payload = core.strtobytes23(text)
    sax.parseString(payload, content_handler, error_handler)
    extracted = ''.join(content_handler.text)

    # DOTALL: the block body may contain newlines.
    block_pattern = re.compile(r'@start_block@(.*?)@end_block@',
                               re.DOTALL | re.MULTILINE)

    collected = []
    for match in core.Cmd.syntax.findtokens('cmddef', extracted):
        command = XHTMLSaxHandler._decode_spec_chars(match.group(1))
        collected.append(core.CmdToken(command, match.start(0), match.end(0)))

    for match in block_pattern.finditer(extracted):
        # Trim leading newlines and trailing newlines/spaces before the
        # indentation is removed.
        trimmed = match.group(1).lstrip('\n').rstrip('\n ')
        dedented = core.deltextindent(trimmed)
        code = XHTMLSaxHandler._decode_spec_chars(dedented)
        collected.append(core.BlkCodeToken(code, match.start(0), match.end(0)))

    # sorted() is stable, so equal-start tokens keep their insertion order.
    ordered = sorted(collected, key=lambda entry: entry.start)
    ordered.append(core.EndToken(None))
    return ordered
def tokens(self, text):
    """Return the position-ordered token list for an OpenOffice document.

    Steps:

    1. Parse ``text`` with ``sax.parseString`` using an ``OOSaxHandler``
       configured from ``self.style``.
    2. Join the handler's collected ``text`` pieces into one string.
    3. Create a ``core.CmdToken`` for every ``'cmddef'`` match reported by
       ``core.Cmd.syntax.findtokens``, a ``core.InlCodeToken`` for every
       ``@start_inline@`` ... ``@end_inline@`` region and a
       ``core.BlkCodeToken`` for every ``@start_block@`` ... ``@end_block@``
       region.
    4. Sort by ``start`` and append ``core.EndToken(None)``.

    :param text: document source; converted with ``core.strtobytes23`` when
        it is exactly of type ``str``, otherwise used unchanged.
    :return: list of token objects ending with the end token.
    """
    content_handler = OOSaxHandler(style=self.style)

    # The original author noted that the parser needs bytes rather than str
    # here, hence the conversion.
    payload = text
    if type(text) is str:
        payload = core.strtobytes23(text)
    sax.parseString(payload, content_handler)
    extracted = ''.join(content_handler.text)

    # NOTE(review): no re.DOTALL on either pattern, so "." stops at a
    # newline and regions spanning several lines are skipped -- verify that
    # the handler output never needs multi-line matches.
    inline_pattern = re.compile(r'@start_inline@(.*?)@end_inline@',
                                re.MULTILINE)
    block_pattern = re.compile(r'@start_block@(.*?)@end_block@',
                               re.MULTILINE)

    collected = []
    for match in core.Cmd.syntax.findtokens('cmddef', extracted):
        command = OOSaxHandler._decode_spec_chars(match.group(1))
        collected.append(core.CmdToken(command, match.start(0), match.end(0)))

    for match in inline_pattern.finditer(extracted):
        # XXX (kept from the original): a line break in OO is coded with
        # paragraph styling, so this linearizing does not help.
        snippet = OOSaxHandler._decode_spec_chars(match.group(1))
        snippet = core.InlCodeToken.linearize(snippet)
        collected.append(
            core.InlCodeToken(snippet, match.start(0), match.end(0)))

    for match in block_pattern.finditer(extracted):
        code = OOSaxHandler._decode_spec_chars(match.group(1))
        collected.append(core.BlkCodeToken(code, match.start(0), match.end(0)))

    # sorted() is stable, so equal-start tokens keep their insertion order.
    ordered = sorted(collected, key=lambda entry: entry.start)
    ordered.append(core.EndToken(None))
    return ordered