def itertokens(self):
    """Read TeX tokens, recursively following imported files."""
    dirname = os.path.dirname(self.filename)
    tex = TeX(file=self.filename)
    tokens = tex.itertokens()
    try:
        while True:
            token = next(tokens)
            if token.data == 'input':
                # Splice the \input'd file into the current token stream.
                tokens = self.add_input(dirname, tokens)
            elif token.data == 'import':
                # TODO handle \subimport, and also \import* and \subimport*
                print("WARNING: we don't handle \\import yet")
                yield token
            elif token.data == 'include':
                # TODO be aware of \includeonly
                tokens = self.add_input(dirname, tokens)
            elif token.data == 'newcommand':
                # Record the macro definition so later uses can be expanded.
                try:
                    name = read_macro_name(tokens)
                    args_or_def = read_balanced_brackets(tokens)
                    if args_or_def[0] == '[':
                        # An optional [n] argument count precedes the body.
                        n_args = format_n_args(args_or_def)
                        definition = read_balanced_brackets(tokens)
                    else:
                        n_args = 0
                        definition = args_or_def
                    # Store the body with its outer braces stripped.
                    self.macro_lut[name] = MacroDef(n_args, definition[1:-1])
                except Exception:
                    # Malformed \newcommand: pass the token through untouched.
                    yield token
            else:
                # Expand the token if it names a known macro, else yield as-is.
                for t in maybe_expand_macro(token, tokens, self.macro_lut):
                    yield t
    except StopIteration:
        pass
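# Illustrative sketch only: the real read_balanced_brackets used above is
# defined elsewhere in this repo, and this hypothetical stand-in (note the
# _sketch suffix) just shows the technique the method relies on — consume
# tokens until the group opened by the first token is balanced again, and
# return the group with its delimiters included, which is what the
# definition[1:-1] slice above assumes.
def _read_balanced_brackets_sketch(tokens):
    pairs = {'{': '}', '[': ']'}
    first = next(tokens)
    opener = first.data if hasattr(first, 'data') else str(first)
    closer = pairs.get(opener)
    if closer is None:
        # Not a group at all (e.g. \newcommand\foo\bar): single bare token.
        return [opener]
    out, depth = [opener], 1
    while depth:
        tok = next(tokens)
        data = tok.data if hasattr(tok, 'data') else str(tok)
        if data == opener:
            depth += 1
        elif data == closer:
            depth -= 1
        out.append(data)
    return out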
import sys

from plasTeX.TeX import TeX

from utils import pre_tokenize, post_tokenize, BASIC_SKELETON


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: python %s <tabulars> <tokenized-tabulars>' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)
    with open(sys.argv[1]) as fin, open(sys.argv[2], 'w') as fout:
        for line in fin:
            _, tabular = line.split('\t', 1)
            # Wrap the bare tabular in a minimal document so plasTeX can parse it.
            tex = TeX()
            tex.input(BASIC_SKELETON % pre_tokenize(tabular))
            tokens = list(tex.itertokens())
            tokens_out = post_tokenize(tokens)
            # The skeleton is a fixed template, so its preamble/postamble
            # contribute a fixed-length prefix and suffix to the joined token
            # string; the constant slice strips them back out.
            fout.write(' '.join(tokens_out)[385:-86] + '\n')
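# Example invocation (file names are illustrative): each input line is
# "<id>\t<tabular latex>", and each output line is the space-joined token
# stream for that tabular:
#
#   python tokenize_tabulars.py tabulars.tsv tabulars.tok
#
# For orientation, BASIC_SKELETON (imported from utils above, not shown here)
# is expected to be a minimal LaTeX document template with a single %s slot
# for the tabular body; the sketch below is a hypothetical stand-in, not the
# repo's actual template, whose token contribution the [385:-86] slice strips.
_SKELETON_SKETCH = (
    "\\documentclass{article}\n"
    "\\begin{document}\n"
    "%s\n"
    "\\end{document}\n"
)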