def build_webassets(output_dir):
    """Render the Henry IV sample text to a paginated HTML example file.

    Reads ``1_KING_HENRY_IV_rev.txt`` from the current working directory,
    tokenizes it with ``RegexTokenizer``, tags the tokens with
    ``DocuscopeTagger`` and writes the ``HTMLFormatter`` output into
    *output_dir*.

    :param output_dir: directory that receives the generated HTML file.
    """
    source_name = '1_KING_HENRY_IV_rev.txt'
    target_name = 'Ubiqu+Ity_1_KING_HENRY_IV_Docuscope_Example_Output.html'

    # Load the raw text to be processed.
    with open(source_name, 'r') as source:
        raw_text = source.read()

    # Tokenize, then tag (the tagger is asked to return included tags).
    token_list = RegexTokenizer().tokenize(raw_text)
    tag_data = DocuscopeTagger(return_included_tags=True).tag(token_list)

    # The formatter's web-asset step runs before the page is rendered.
    html_formatter = HTMLFormatter()
    html_formatter._build_webassets()
    rendered = html_formatter.format_paginated(
        tags=tag_data,
        tokens=token_list,
        text_name=source_name,
        text_relative_path="",
        processing_id="",
    )

    target_path = os.path.join(output_dir, target_name)
    with open(target_path, 'w') as target:
        target.write(rendered)
def setUp(self):
    """Create a three-entry tag dictionary and its prepared form.

    Sets ``self.tags``, ``self.formatter`` and ``self.prepared_tags`` (the
    result of ``HTMLFormatter.prepare_tags`` applied to ``self.tags``).
    """
    raw_tags = {
        'key1': {'name': '1', 'full_name': 'key1'},
        'key2': {'name': '2', 'full_name': 'key2'},
        'key3': {'name': '3', 'full_name': 'key3'},
    }
    self.tags = raw_tags

    html_formatter = HTMLFormatter()
    self.formatter = html_formatter

    # The same dict object stored on self.tags is handed to the formatter.
    self.prepared_tags = html_formatter.prepare_tags(raw_tags)
def setUp(self):
    """Build the token/tag fixture for one line of text and prepare it.

    Sets ``self.formatter``, ``self.text``, ``self.tokens``, ``self.tags``,
    ``self.prepared_tokens`` (from ``HTMLFormatter.prepare_tokens``),
    ``self.tag_map`` and ``self.pos_map``.
    """
    html_formatter = HTMLFormatter()
    self.formatter = html_formatter

    self.text = "So shaken as we are, so wan with care"

    # NOTE(review): each entry looks like [[text], start offset, length,
    # type code] with 0 = word, 1 = punctuation, 2 = whitespace -- confirm
    # against the tokenizer that produces these.
    token_rows = [
        [['So'], 0, 2, 0],
        [[' '], 2, 1, 2],
        [['shaken'], 3, 6, 0],
        [[' '], 9, 1, 2],
        [['as'], 10, 2, 0],
        [[' '], 12, 1, 2],
        [['we'], 13, 2, 0],
        [[' '], 15, 1, 2],
        [['are'], 16, 3, 0],
        [[','], 19, 1, 1],
        [[' '], 20, 1, 2],
        [['so'], 21, 2, 0],
        [[' '], 23, 1, 2],
        [['wan'], 24, 3, 0],
        [[' '], 27, 1, 2],
        [['with'], 28, 4, 0],
        [[' '], 32, 1, 2],
        [['care'], 33, 4, 0],
    ]
    self.tokens = token_rows

    # One dict per tagged span; every rule is a (rule name, matched text) pair.
    tagged_spans = [
        {
            'index_end': 2,
            'rules': [('DocuscopeTagger..default.(2, 3).True.EXCL_!UNTAGGED.EXCL_!NORULES.EXCL_!EXCLUDED.Transformation', 'shaken')],
            'token_end_len': 6,
            'len': 1,
            'num_included_tokens': 1,
            'index_start': 2,
            'pos_end': 3,
            'pos_start': 3,
        },
        {
            'index_end': 8,
            'rules': [('DocuscopeTagger..default.(2, 3).True.EXCL_!UNTAGGED.EXCL_!NORULES.EXCL_!EXCLUDED.ReportingStates', ('we', 'are'))],
            'token_end_len': 3,
            'len': 1,
            'num_included_tokens': 2,
            'index_start': 6,
            'pos_end': 16,
            'pos_start': 13,
        },
        {
            'index_end': 11,
            'rules': [('DocuscopeTagger..default.(2, 3).True.EXCL_!UNTAGGED.EXCL_!NORULES.EXCL_!EXCLUDED.ReasonForward', (',', 'so'))],
            'token_end_len': 2,
            'len': 1,
            'num_included_tokens': 2,
            'index_start': 9,
            'pos_end': 21,
            'pos_start': 19,
        },
        {
            'index_end': 13,
            'rules': [('DocuscopeTagger..default.(2, 3).True.EXCL_!UNTAGGED.EXCL_!NORULES.EXCL_!EXCLUDED.Negativity', 'wan')],
            'token_end_len': 3,
            'len': 1,
            'num_included_tokens': 1,
            'index_start': 13,
            'pos_end': 24,
            'pos_start': 24,
        },
        {
            'index_end': 17,
            'rules': [('DocuscopeTagger..default.(2, 3).True.EXCL_!UNTAGGED.EXCL_!NORULES.EXCL_!EXCLUDED.StandardsPos', ('with', 'care'))],
            'token_end_len': 4,
            'len': 1,
            'num_included_tokens': 2,
            'index_start': 15,
            'pos_end': 33,
            'pos_start': 28,
        },
    ]
    # The tag structure is a two-element list: None followed by the spans.
    tag_data = [None, tagged_spans]
    self.tags = tag_data

    self.prepared_tokens = html_formatter.prepare_tokens(
        tokens=token_rows,
        tags=tag_data,
    )

    self.tag_map = {
        1: 'Transformation',
        3: 'ReportingStates',
        4: 'ReasonForward',
        6: 'Negativity',
        8: 'StandardsPos',
    }
    self.pos_map = {
        0: 0,
        1: 3,
        2: 9,
        3: 13,
        4: 19,
        5: 23,
        6: 24,
        7: 27,
        8: 28,
    }
def setUp(self):
    """Paginate the integers 1 through 10 with a page size of 3.

    Sets ``self.pages``, ``self.page_size``, ``self.formatter`` and
    ``self.paginated`` (the result of ``HTMLFormatter.paginate``).
    """
    items = list(range(1, 11))  # [1, 2, ..., 10]
    per_page = 3

    self.pages = items
    self.page_size = per_page

    html_formatter = HTMLFormatter()
    self.formatter = html_formatter
    self.paginated = html_formatter.paginate(items, per_page)