# Append the optional LanguageTool arguments given on the command line.
if cmdline.disable:
    ltcommand += ['--disable', cmdline.disable]
if cmdline.lt_options:
    # the first character of the option value is dropped before splitting
    # (presumably a guard character so that the value is not parsed as an
    # option itself -- TODO confirm against the argument parser)
    ltcommand += cmdline.lt_options[1:].split()
# NOTE(review): '-' presumably makes LanguageTool read from stdin -- confirm
ltcommand += ['-']
if cmdline.lt_server_options:
    ltserver_local_cmd += ' ' + cmdline.lt_server_options[1:]

#   on option --include: add included files to work list
#   otherwise: remove duplicates
#
if cmdline.include:
    sys.stderr.write('=== checking for file inclusions ... ')
    sys.stderr.flush()
    opts = tex2txt.Options(extr=inclusion_macros, repl=cmdline.replace,
                           defs=cmdline.define, lang=cmdline.t2t_lang)

    def skip_file(fn):
        """Tell whether file name fn matches the regex from option --skip.

        Returns a falsy value if --skip is unset or does not match the
        complete file name, otherwise the (truthy) match object.
        """
        # re.fullmatch() anchors the whole pattern; plain concatenation
        # r'\A' + skip + r'\Z' would bind the anchors only to the first and
        # last alternative of a pattern such as 'a\.tex|b\.tex'
        return cmdline.skip and re.fullmatch(cmdline.skip, fn)

    # work list of files still to be inspected; 'done' keeps the files
    # already accepted, in order of first appearance
    todo = cmdline.file
    done = []
    while todo:
        f = todo.pop(0)
        # ignore duplicates and files excluded by --skip
        if f in done or skip_file(f):
            continue
        done.append(f)
\textcolor{red}{redx colour.}} is lazy. """

# expected plain text for the LaTeX input whose definition ends above
plain_t = r""" Only few people is lazy. We use redx colour. """

# expected second return value of tex2txt.tex2txt();
# with char=True these are presumably character positions in the
# LaTeX source -- TODO confirm
nums_t = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
    27, 27, 27, 27, 28, 29, 30, 31, 32, 33,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
    63, 74 ]

# the conversion is run once at import time; both tests check its results
options = tex2txt.Options(lang='en', char=True)
plain, nums = tex2txt.tex2txt(latex, options)


def test_text():
    """The plain text returned by tex2txt() equals the expected text."""
    assert plain == plain_t


def test_nums():
    """The number list returned by tex2txt() equals the expected list."""
    assert nums == nums_t
#
input_encoding = 'utf-8'
# input_encoding = 'latin-1'

#   path of LT java archive and used options
#
ltjar = '../LT/LanguageTool-4.7/languagetool-commandline.jar'
# built directly as an argument list, so that a jar path containing blanks
# remains a single argument (splitting a joined string would break it up)
ltcmd = ['java', '-jar', ltjar,
         '--language', 'en-GB', '--encoding', 'utf-8',
         '--disable', 'WHITESPACE_RULE']

#   prepare options for tex2txt()
#
options = tex2txt.Options(
    char=True,
    # repl=tex2txt.read_replacements('Tools/LT/repls.txt',
    #                                encoding=input_encoding),
    # defs=tex2txt.read_definitions('Tools/LT/defs.py',
    #                               encoding='utf-8'),
    lang='en')

# process each file named on the command line
for file in sys.argv[1:]:
    sys.stderr.write('=== ' + file + '\n')
    sys.stderr.flush()

    #   read file and call tex2txt()
    #
    f = tex2txt.myopen(file, encoding=input_encoding)
    try:
        tex = f.read()
    finally:
        # close the file even if reading fails, e.g. on a decoding error
        f.close()
    (plain, charmap) = tex2txt.tex2txt(tex, options)