def build_webassets(output_dir):
    """Generate the Docuscope example HTML page for 1 King Henry IV.

    Reads ``1_KING_HENRY_IV_rev.txt`` (resolved relative to the current
    working directory), tokenizes and tags it, asks the HTML formatter to
    run its ``_build_webassets()`` step, and writes the paginated HTML
    output into ``output_dir``.

    :param output_dir: Directory in which the example HTML file is created.
    """
    source_name = '1_KING_HENRY_IV_rev.txt'

    with open(source_name, 'r') as source_file:
        raw_text = source_file.read()

    # Tokenize, then tag with included tags returned alongside the rest.
    token_list = RegexTokenizer().tokenize(raw_text)
    tag_data = DocuscopeTagger(return_included_tags=True).tag(token_list)

    html_formatter = HTMLFormatter()
    # NOTE(review): presumably emits the static assets the page relies on;
    # confirm against HTMLFormatter.
    html_formatter._build_webassets()
    page = html_formatter.format_paginated(
        tags=tag_data,
        tokens=token_list,
        text_name=source_name,
        text_relative_path="",
        processing_id="",
    )

    destination = os.path.join(
        output_dir,
        'Ubiqu+Ity_1_KING_HENRY_IV_Docuscope_Example_Output.html',
    )
    with open(destination, 'w') as destination_file:
        destination_file.write(page)
def format_ds(input_file):
    """Tag the text at ``input_file`` and return it in Docuscope (LAT) format.

    The file is read in full, tokenized with ``RegexTokenizer``, tagged with
    ``DocuscopeTagger`` and rendered through ``LATFormatter``. Whatever the
    formatter's ``format`` call produces is returned (a string, according to
    the original documentation of this function).

    :param input_file: Path of the text file to process.
    """
    with open(input_file, 'r') as handle:
        raw_text = handle.read()

    token_list = RegexTokenizer().tokenize(raw_text)
    tag_data = DocuscopeTagger(return_included_tags=True).tag(token_list)

    # Hack to fix LAT names: in each tag's first rule, keep only the part of
    # the leading element that follows the last '.'.
    for tag in tag_data[1]:
        first_rule = list(tag['rules'][0])
        first_rule[0] = first_rule[0].rsplit('.')[-1]
        rules = list(tag['rules'])
        # Swap the patched rule in at position 0 (it stays a list, as before).
        rules[0] = first_rule
        tag['rules'] = tuple(rules)

    lat_formatter = LATFormatter.LATFormatter()
    return lat_formatter.format(
        tags=tag_data,
        tokens=token_list,
        s=raw_text,
        input_file=input_file,
    )