def __init__(self, lyx, outcb=None, debugcb=None, debug_level=0): self.app_debugcb = debugcb self.debug_level = debug_level if debugcb: self.dbg = self._debugcb else: self.dbg = self._null_debugcb self.stack = [] self.i = 0 if type(lyx) == list: self.lines = lyx elif type(lyx) == str: self.lines = _read_lyx(lyx) else: assert lyx.readlines self.lines = lyx.readlines() self.xout = XmlStreamer(outcb or LyX2XML._outcb, 'lyx') # No DTD... # XXX Should move the rest of this to a 'convert' method. # # Fix the lack of XML-ish nesting of things like \series, \emph, # \family, \color, \shape, and \lang self._fix_text_styling() # Uncomment to save a copy of the .lyx with fixed styling, for # debugging purposes: #f = open('/tmp/f', 'w+') #for i in range(len(self.lines)): # f.write(self.lines[i]) # if self.lines[i][-1] != '\n': # f.write('\n') #f.close() if self.debug_level >= 3: self.dbg(3, 'Fixed lines:\n') for i in range(len(self.lines)): self.dbg(3, '%d %s' % (i + 1, self.lines[i])) self._lyx2xml() self.xout.finish() return None
class LyX2XML(object): debug = False def _debugcb(self, level, text): if level <= self.debug_level: self.app_debugcb(text) def _null_debugcb(self, level, text): pass def _outcb(self, text): sys.stdout.write(text) def __init__(self, lyx, outcb=None, debugcb=None, debug_level=0): self.app_debugcb = debugcb self.debug_level = debug_level if debugcb: self.dbg = self._debugcb else: self.dbg = self._null_debugcb self.stack = [] self.i = 0 if type(lyx) == list: self.lines = lyx elif type(lyx) == str: self.lines = _read_lyx(lyx) else: assert lyx.readlines self.lines = lyx.readlines() self.xout = XmlStreamer(outcb or LyX2XML._outcb, 'lyx') # No DTD... # XXX Should move the rest of this to a 'convert' method. # # Fix the lack of XML-ish nesting of things like \series, \emph, # \family, \color, \shape, and \lang self._fix_text_styling() # Uncomment to save a copy of the .lyx with fixed styling, for # debugging purposes: #f = open('/tmp/f', 'w+') #for i in range(len(self.lines)): # f.write(self.lines[i]) # if self.lines[i][-1] != '\n': # f.write('\n') #f.close() if self.debug_level >= 3: self.dbg(3, 'Fixed lines:\n') for i in range(len(self.lines)): self.dbg(3, '%d %s' % (i + 1, self.lines[i])) self._lyx2xml() self.xout.finish() return None # Returns one past the new location of self.lines[i], after possibly # inserting lines to close and re-open tags as implied by # self.lines[i], to make it possible to emit proper XML. def _close_and_reopen_styling(self, i, tag, new_style): lines = self.lines stack = self.stack found = False # XXX Could use an __in__ operator overloading for this code (or # could use lists' in operator with a list comprehension to make # a list with just the first element of each tuple in stack): for k in range(len(stack) - 1, -1, -1): if stack[k][0] == tag: found = True break if not found: # We're just opening a tag that's not already in the stack. # Not much to do in this case. assert new_style != mixed_tags[tag] self.dbg(2, 'adding (\\%s, %s) to the stack (%s) at %d' % (tag, new_style, repr(stack), i)) stack.append((tag, new_style)) return i + 1 # OK, we have work to do: close all intervening tags, close this # one, re-open all those tags and this one. self.dbg(2, 'fixing ordering for \\%s at line %d, stack = %s' % (tag, i, repr(stack))) m = -1 for k in range(len(stack) - 1, -1, -1): # Close tags m = k # If this element in the stack is not the one we'll # terminate at (tag) or if it is but we're opening a new # style: if stack[k][0] != tag or new_style != mixed_tags[tag]: # Close the tag stack[k][0] self.dbg(2, 'fixing ordering for \\%s by closing %s %s to re-open it' % (tag, stack[k][0], stack[k][1])) lines.insert(i, '\\' + stack[k][0] + ' ' + mixed_tags[stack[k][0]]) i += 1 # keep i pointing at the line triggering all this if stack[k][0] == tag: stack[k] = (tag, new_style) m += 1 break else: assert stack[k][0] == tag and new_style == mixed_tags[tag] # This line closes a tag, so just remove the # corresponding entry in the stack (any others before # this one will have been closed above). del(stack[k]) break # Now re-open those intervening tags that we closed; we do this # *after* the line triggering all this, since that's the one # opening a tag. i += 1 k = m while k > -1 and k < len(stack): lines.insert(i, '\\' + stack[k][0] + ' ' + stack[k][1]) k += 1 i += 1 self.dbg(5, 'foo at %d: %s' % (i, lines[i])) return i def _fix_text_styling(self): lines = self.lines self.stack = [] stack = self.stack fixes = [] # First we'll rename the \color in the header, if any. i = find_tokens(lines, ('\\color', '\\end_header'), 0) if i != -1 and \ check_token(lines[i], '\\color') and \ not get_containing_layout(lines, i): lines[i] = '\\color_in_header ' + lines[i].split()[1] # Now let's get on with the rest i = 0 while i < len(lines): line = lines[i] self.dbg(4, 'looking at line %d: %s' % (i, line)) # XXX We really should simplify all this startswith() # nonsense. if line.startswith('\\end_'): self.dbg(2, 'fixing unended style tags %s' % (repr(stack))) for k in range(len(stack) - 1, -1, -1): lines.insert(i, '\\' + stack[k][0] + ' ' + mixed_tags[stack[k][0]]) i += 1 del(stack[0:]) i += 1 continue if not line.startswith('\\') or line.find(' ') == -1: self.dbg(5, '1 i++ at %d' % (i,)) i += 1 continue # XXX And this parsing nonsense needs refactoring too line = _chomp(line[1:]) a = line[0:line.find(' ')] if not a in mixed_tags: self.dbg(5, '2 i++ at %d' % (i,)) i += 1 continue v = line[line.find(' ') + 1:] # We're opening a new whatever it is. But we need to handle # the possibility that we're changing the whatever it is! # How to handle this? We could convert: # \lang french foo \lang spanish bar \lang english foobar # to # <lang a="french">foo<lang a="spanish">bar</lang></lang> # or to # <lang a="french">foo</lang><lang a="spanish">bar</lang> # The former might be easier: just close all tags up to the # farthest one in the stack for the tag being closed, then # re-open any others that were found along the way. But the # latter is more sensible, so that's what we do. # Invariant: we never have more than one style tag in the # stack (XXX assert this in code). self.dbg(4, 'seen \\%s at line %d' % (line, i)) i = self._close_and_reopen_styling(i, a, v) self.dbg(5, '3 i++ at %d' % (i,)) def _lyxml2xml(self, start, end): lines = self.lines xout = self.xout i = start while i < end: self.dbg(3, 'Parsing line %d: %s' % (i, lines[i])) if lines[i].startswith('</'): xout.end_elt(lines[i][2:-2]) i += 1 elif lines[i].startswith('<'): (tag, attrs, start_tok, end_tok) = _parse_xml_tag(lines[i]) # There don't seem to be tags with no child nodes in .lyx, # but let's handle them just in case. if lines[i][-2] != '/': e = find_end_of(lines, i, start_tok, end_tok) self.dbg(3, 'find_end_of(%d, %s, %s) = %d' % (i, start_tok, end_tok, e)) # lyxtabular's <column> and <features> don't get closed! # What a mess. if e == -1: e = None else: e = None xout.start_elt(tag) xout.attr('embedded_xml', 'true') for a in attrs: xout.attr(a[0], a[1]) if e: i = self._lyxml2xml(i + 1, e) self.dbg(3, '_lyx2xml(...) = %d, looking for %d' % (i, e)) if i + 1 == e: i += 1 else: self.dbg(3, '_lyx2xml() returned %d, e = %d' % (i, e)) else: xout.end_elt(tag) i += 1 else: i = self._lyx2xml(i, end) return i def _handle_interspersed_attrs(self, start, end): lines = self.lines xout = self.xout i = start depth = 0 while i < end: if lines[i].startswith('\\begin_') or lines[i].startswith('\\index '): depth += 1 elif lines[i].startswith('\\end_'): depth -= 1 elif depth == 0 and lines[i].startswith('\\'): (a, v) = _parse_attr(lines[i]) if a and v: xout.attr(a, v) if a == 'language': # Set the default language; needed for the style # tag fix code. mixed_tags['lang'] = v self.dbg(2, 'Default language is %s' % (v,)) i += 1 def _lyx2xml(self, start=0, end=-1, cmd_type=None): lines = self.lines xout = self.xout i = start if end < 0: end = len(lines) prev_i = -1 while i < end: self.dbg(3, 'Parsing line %d: %s' % (i, lines[i])) assert i > prev_i prev_i = i if len(lines[i]) == 0 or lines[i] == ' ': # Ignore empty lines i += 1 cmd_type = None elif lines[i][0] == '#': # LyX source comment #xout.comment(lines[i][1:]) i += 1 cmd_type = None elif _beginner(lines[i]): (el, start_tok, end_tok, cmd_type, rest) = _parse_begin(lines[i]) xout.start_elt(el) if el == 'document': # Default XML namespace xout.attr('xmlns', 'urn:cryptonector.com:lyx-other') # We put layouts, insets, and custome insets into # separate namespaces xout.attr('xmlns:layout', 'urn:cryptonector.com:lyx-layout') xout.attr('xmlns:inset', 'urn:cryptonector.com:lyx-inset') xout.attr('xmlns:flex', 'urn:cryptonector.com:lyx-flex') # XXX MathML not yet implemented; we get to turn LyX # formulas into MathML, joy! But it looks like # LaTeX math, and there's tools for converting that # to MathML, so, hey, it might not be too much work. xout.attr('xmlns:math', 'http://www.w3.org/1998/Math/MathML') # Suck in \lyxformat, which precedes \begin_document, # and anything else that might be there (though # nothing will be). self._handle_interspersed_attrs(0, i) if (el.startswith('inset:') or el.startswith('flex:')) and \ not cmd_type and (i + 1) < end and \ lines[i + 1].startswith('status '): i += 1 # skip status open|collapsed line status = _chomp(lines[i][lines[i].find(' ') + 1:]) else: status = None self.dbg(4, 'lines[%d] = %s' % (i, lines[i])) e = find_end_of(lines, i, start_tok, end_tok) assert e != -1 self.dbg(4, 'find_end_of(%d, %s, %s) = %d' % (i, start_tok, end_tok, e)) # XXX Here we need to find any attributes that might be # interspersed with child nodes so we can suck them in # first. What a PITA. self._handle_interspersed_attrs(i + 1, e - 1) if status: xout.attr('status', status) if len(rest) == 2 and el != 'inset:Formula': xout.attr(rest[0], escape(rest[1])) i = self._lyx2xml(i + 1, e, cmd_type) self.dbg(4, '_lyx2xml(...) = %d, looking for %s at %d; end = %d' % (i, end_tok, e, end)) if i + 1 == e: i += 1 else: self.dbg(4, '_lyx2xml() returned %d, e = %d; end = %d' % (i, e, end)) assert lines[i].startswith('\\end_') if len(rest) == 2 and el == 'inset:Formula': xout.text(' ') xout.text(escape(rest[1])) xout.end_elt(el) cmd_type = None i += 1 elif cmd_type == 'inset': # Parse "\begin_inset CommandInset ..." attributes self.dbg(4, 'lines[%d] = %s' % (i, lines[i])) while i < end and lines[i] != '' and lines[i] != ' ': (a, v) = _parse_attr(lines[i]) if not a or not v: break xout.attr(a, v) i += 1 # then suck in content cmd_type = None elif cmd_type == 'XML': # Parse embedded XML contents i = self._lyxml2xml(i, end) cmd_type = None elif lines[i][0] == '\\' and \ not lines[i].startswith('\\begin_') and \ not lines[i].startswith('\\end_') and \ not _key(lines[i]) in mixed_tags: # An attribute, which we've handled above with the call to # _handle_interspersed_attrs(). i += 1 else: line = lines[i] if xout.stack[-1] == 'layout': line = _chomp(line) key = _key(line) if key in mixed_tags: val = _chomp(lines[i][lines[i].find(' ') + 1:]) if val == mixed_tags[key]: xout.end_elt(key) else: xout.start_elt(key) xout.attr('type', val) else: xout.text(escape(line)) cmd_type = None i += 1 return i
class LyX2XML(object): debug = False def _debugcb(self, level, text): if level <= self.debug_level: self.app_debugcb(text) def _null_debugcb(self, level, text): pass def _outcb(self, text): sys.stdout.write(text) def __init__(self, lyx, outcb=None, debugcb=None, debug_level=0): self.app_debugcb = debugcb self.debug_level = debug_level if debugcb: self.dbg = self._debugcb else: self.dbg = self._null_debugcb self.stack = [] self.i = 0 if type(lyx) == list: self.lines = lyx elif type(lyx) == str: self.lines = _read_lyx(lyx) else: assert lyx.readlines self.lines = lyx.readlines() self.xout = XmlStreamer(outcb or LyX2XML._outcb, 'lyx') # No DTD... # XXX Should move the rest of this to a 'convert' method. # # Fix the lack of XML-ish nesting of things like \series, \emph, # \family, \color, \shape, and \lang self._fix_text_styling() # Uncomment to save a copy of the .lyx with fixed styling, for # debugging purposes: #f = open('/tmp/f', 'w+') #for i in range(len(self.lines)): # f.write(self.lines[i]) # if self.lines[i][-1] != '\n': # f.write('\n') #f.close() if self.debug_level >= 3: self.dbg(3, 'Fixed lines:\n') for i in range(len(self.lines)): self.dbg(3, '%d %s' % (i + 1, self.lines[i])) self._lyx2xml() self.xout.finish() return None # Returns one past the new location of self.lines[i], after possibly # inserting lines to close and re-open tags as implied by # self.lines[i], to make it possible to emit proper XML. def _close_and_reopen_styling(self, i, tag, new_style): lines = self.lines stack = self.stack found = False # XXX Could use an __in__ operator overloading for this code (or # could use lists' in operator with a list comprehension to make # a list with just the first element of each tuple in stack): for k in range(len(stack) - 1, -1, -1): if stack[k][0] == tag: found = True break if not found: # We're just opening a tag that's not already in the stack. # Not much to do in this case. assert new_style != mixed_tags[tag] self.dbg( 2, 'adding (\\%s, %s) to the stack (%s) at %d' % (tag, new_style, repr(stack), i)) stack.append((tag, new_style)) return i + 1 # OK, we have work to do: close all intervening tags, close this # one, re-open all those tags and this one. self.dbg( 2, 'fixing ordering for \\%s at line %d, stack = %s' % (tag, i, repr(stack))) m = -1 for k in range(len(stack) - 1, -1, -1): # Close tags m = k # If this element in the stack is not the one we'll # terminate at (tag) or if it is but we're opening a new # style: if stack[k][0] != tag or new_style != mixed_tags[tag]: # Close the tag stack[k][0] self.dbg( 2, 'fixing ordering for \\%s by closing %s %s to re-open it' % (tag, stack[k][0], stack[k][1])) lines.insert( i, '\\' + stack[k][0] + ' ' + mixed_tags[stack[k][0]]) i += 1 # keep i pointing at the line triggering all this if stack[k][0] == tag: stack[k] = (tag, new_style) m += 1 break else: assert stack[k][0] == tag and new_style == mixed_tags[tag] # This line closes a tag, so just remove the # corresponding entry in the stack (any others before # this one will have been closed above). del (stack[k]) break # Now re-open those intervening tags that we closed; we do this # *after* the line triggering all this, since that's the one # opening a tag. i += 1 k = m while k > -1 and k < len(stack): lines.insert(i, '\\' + stack[k][0] + ' ' + stack[k][1]) k += 1 i += 1 self.dbg(5, 'foo at %d: %s' % (i, lines[i])) return i def _fix_text_styling(self): lines = self.lines self.stack = [] stack = self.stack fixes = [] # First we'll rename the \color in the header, if any. i = find_tokens(lines, ('\\color', '\\end_header'), 0) if i != -1 and \ check_token(lines[i], '\\color') and \ not get_containing_layout(lines, i): lines[i] = '\\color_in_header ' + lines[i].split()[1] # Now let's get on with the rest i = 0 while i < len(lines): line = lines[i] self.dbg(4, 'looking at line %d: %s' % (i, line)) # XXX We really should simplify all this startswith() # nonsense. if line.startswith('\\end_'): self.dbg(2, 'fixing unended style tags %s' % (repr(stack))) for k in range(len(stack) - 1, -1, -1): lines.insert( i, '\\' + stack[k][0] + ' ' + mixed_tags[stack[k][0]]) i += 1 del (stack[0:]) i += 1 continue if not line.startswith('\\') or line.find(' ') == -1: self.dbg(5, '1 i++ at %d' % (i, )) i += 1 continue # XXX And this parsing nonsense needs refactoring too line = _chomp(line[1:]) a = line[0:line.find(' ')] if not a in mixed_tags: self.dbg(5, '2 i++ at %d' % (i, )) i += 1 continue v = line[line.find(' ') + 1:] # We're opening a new whatever it is. But we need to handle # the possibility that we're changing the whatever it is! # How to handle this? We could convert: # \lang french foo \lang spanish bar \lang english foobar # to # <lang a="french">foo<lang a="spanish">bar</lang></lang> # or to # <lang a="french">foo</lang><lang a="spanish">bar</lang> # The former might be easier: just close all tags up to the # farthest one in the stack for the tag being closed, then # re-open any others that were found along the way. But the # latter is more sensible, so that's what we do. # Invariant: we never have more than one style tag in the # stack (XXX assert this in code). self.dbg(4, 'seen \\%s at line %d' % (line, i)) i = self._close_and_reopen_styling(i, a, v) self.dbg(5, '3 i++ at %d' % (i, )) def _lyxml2xml(self, start, end): lines = self.lines xout = self.xout i = start while i < end: self.dbg(3, 'Parsing line %d: %s' % (i, lines[i])) if lines[i].startswith('</'): xout.end_elt(lines[i][2:-2]) i += 1 elif lines[i].startswith('<'): (tag, attrs, start_tok, end_tok) = _parse_xml_tag(lines[i]) # There don't seem to be tags with no child nodes in .lyx, # but let's handle them just in case. if lines[i][-2] != '/': e = find_end_of(lines, i, start_tok, end_tok) self.dbg( 3, 'find_end_of(%d, %s, %s) = %d' % (i, start_tok, end_tok, e)) # lyxtabular's <column> and <features> don't get closed! # What a mess. if e == -1: e = None else: e = None xout.start_elt(tag) xout.attr('embedded_xml', 'true') for a in attrs: xout.attr(a[0], a[1]) if e: i = self._lyxml2xml(i + 1, e) self.dbg(3, '_lyx2xml(...) = %d, looking for %d' % (i, e)) if i + 1 == e: i += 1 else: self.dbg(3, '_lyx2xml() returned %d, e = %d' % (i, e)) else: xout.end_elt(tag) i += 1 else: i = self._lyx2xml(i, end) return i def _handle_interspersed_attrs(self, start, end): lines = self.lines xout = self.xout i = start depth = 0 while i < end: if lines[i].startswith('\\begin_') or lines[i].startswith( '\\index '): depth += 1 elif lines[i].startswith('\\end_'): depth -= 1 elif depth == 0 and lines[i].startswith('\\'): (a, v) = _parse_attr(lines[i]) if a and v: xout.attr(a, v) if a == 'language': # Set the default language; needed for the style # tag fix code. mixed_tags['lang'] = v self.dbg(2, 'Default language is %s' % (v, )) i += 1 def _lyx2xml(self, start=0, end=-1, cmd_type=None): lines = self.lines xout = self.xout i = start if end < 0: end = len(lines) prev_i = -1 while i < end: self.dbg(3, 'Parsing line %d: %s' % (i, lines[i])) assert i > prev_i prev_i = i if len(lines[i]) == 0 or lines[i] == ' ': # Ignore empty lines i += 1 cmd_type = None elif lines[i][0] == '#': # LyX source comment #xout.comment(lines[i][1:]) i += 1 cmd_type = None elif _beginner(lines[i]): (el, start_tok, end_tok, cmd_type, rest) = _parse_begin(lines[i]) xout.start_elt(el) if el == 'document': # Default XML namespace xout.attr('xmlns', 'urn:cryptonector.com:lyx-other') # We put layouts, insets, and custome insets into # separate namespaces xout.attr('xmlns:layout', 'urn:cryptonector.com:lyx-layout') xout.attr('xmlns:inset', 'urn:cryptonector.com:lyx-inset') xout.attr('xmlns:flex', 'urn:cryptonector.com:lyx-flex') # XXX MathML not yet implemented; we get to turn LyX # formulas into MathML, joy! But it looks like # LaTeX math, and there's tools for converting that # to MathML, so, hey, it might not be too much work. xout.attr('xmlns:math', 'http://www.w3.org/1998/Math/MathML') # Suck in \lyxformat, which precedes \begin_document, # and anything else that might be there (though # nothing will be). self._handle_interspersed_attrs(0, i) if (el.startswith('inset:') or el.startswith('flex:')) and \ not cmd_type and (i + 1) < end and \ lines[i + 1].startswith('status '): i += 1 # skip status open|collapsed line status = _chomp(lines[i][lines[i].find(' ') + 1:]) else: status = None self.dbg(4, 'lines[%d] = %s' % (i, lines[i])) e = find_end_of(lines, i, start_tok, end_tok) assert e != -1 self.dbg( 4, 'find_end_of(%d, %s, %s) = %d' % (i, start_tok, end_tok, e)) # XXX Here we need to find any attributes that might be # interspersed with child nodes so we can suck them in # first. What a PITA. self._handle_interspersed_attrs(i + 1, e - 1) if status: xout.attr('status', status) if len(rest) == 2 and el != 'inset:Formula': xout.attr(rest[0], escape(rest[1])) i = self._lyx2xml(i + 1, e, cmd_type) self.dbg( 4, '_lyx2xml(...) = %d, looking for %s at %d; end = %d' % (i, end_tok, e, end)) if i + 1 == e: i += 1 else: self.dbg( 4, '_lyx2xml() returned %d, e = %d; end = %d' % (i, e, end)) assert lines[i].startswith('\\end_') if len(rest) == 2 and el == 'inset:Formula': xout.text(' ') xout.text(escape(rest[1])) xout.end_elt(el) cmd_type = None i += 1 elif cmd_type == 'inset': # Parse "\begin_inset CommandInset ..." attributes self.dbg(4, 'lines[%d] = %s' % (i, lines[i])) while i < end and lines[i] != '' and lines[i] != ' ': (a, v) = _parse_attr(lines[i]) if not a or not v: break xout.attr(a, v) i += 1 # then suck in content cmd_type = None elif cmd_type == 'XML': # Parse embedded XML contents i = self._lyxml2xml(i, end) cmd_type = None elif lines[i][0] == '\\' and \ not lines[i].startswith('\\begin_') and \ not lines[i].startswith('\\end_') and \ not _key(lines[i]) in mixed_tags: # An attribute, which we've handled above with the call to # _handle_interspersed_attrs(). i += 1 else: line = lines[i] if xout.stack[-1] == 'layout': line = _chomp(line) key = _key(line) if key in mixed_tags: val = _chomp(lines[i][lines[i].find(' ') + 1:]) if val == mixed_tags[key]: xout.end_elt(key) else: xout.start_elt(key) xout.attr('type', val) else: xout.text(escape(line)) cmd_type = None i += 1 return i