Exemplo n.º 1
0
def build_webassets(output_dir):
    """Generate the example Docuscope HTML page for the Henry IV sample text.

    Reads ``1_KING_HENRY_IV_rev.txt`` (resolved against the current working
    directory), tokenizes and tags it, renders it through ``HTMLFormatter``
    and writes the result into ``output_dir`` as
    ``Ubiqu+Ity_1_KING_HENRY_IV_Docuscope_Example_Output.html``.

    :param output_dir: directory that receives the generated HTML file.
    """
    source_name = '1_KING_HENRY_IV_rev.txt'
    with open(source_name, 'r') as source:
        raw_text = source.read()

    token_list = RegexTokenizer().tokenize(raw_text)
    tag_result = DocuscopeTagger(return_included_tags=True).tag(token_list)

    html_formatter = HTMLFormatter()
    # Called explicitly before rendering, exactly as the original flow does.
    # NOTE(review): presumably this emits the formatter's static assets -- confirm.
    html_formatter._build_webassets()
    page = html_formatter.format_paginated(
        tags=tag_result,
        tokens=token_list,
        text_name=source_name,
        text_relative_path="",
        processing_id=""
    )

    target_path = os.path.join(
        output_dir,
        'Ubiqu+Ity_1_KING_HENRY_IV_Docuscope_Example_Output.html'
    )
    with open(target_path, 'w') as output_file:
        output_file.write(page)
Exemplo n.º 2
0
def format_ds(input_file):
    """Tag the file at ``input_file`` with the Ity DocuscopeTagger and return
    the ``LATFormatter`` rendering of the result.

    :param input_file: path of the text file to read.
    :return: whatever ``LATFormatter.format`` produces (a string, according
        to the original documentation of this function).
    """
    with open(input_file, 'r') as source:
        raw_text = source.read()

    token_list = RegexTokenizer().tokenize(raw_text)
    tag_result = DocuscopeTagger(return_included_tags=True).tag(token_list)

    # Ugly hack to fix LAT names: keep only the part after the last '.' in
    # the name of each entry's first rule. The patched rule is stored as a
    # list, the untouched rules keep their original objects.
    for entry in tag_result[1]:
        rules = entry['rules']
        patched_rule = list(rules[0])
        patched_rule[0] = patched_rule[0].rsplit('.')[-1]
        entry['rules'] = (patched_rule,) + tuple(rules[1:])

    lat_formatter = LATFormatter.LATFormatter()
    return lat_formatter.format(
        tags=tag_result,
        tokens=token_list,
        s=raw_text,
        input_file=input_file
    )
Exemplo n.º 3
0
def format_ds(input_file):
    """Read ``input_file``, run it through the Ity tokenizer and
    DocuscopeTagger, and return the output of ``LATFormatter.format``.

    :param input_file: path of the text file to process.
    :return: the formatter's result (documented by the original author as
        Docuscope-formatted results in string form).
    """
    with open(input_file, 'r') as handle:
        contents = handle.read()

    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(contents)
    tagger = DocuscopeTagger(return_included_tags=True)
    tagged = tagger.tag(tokens)

    # Ugly hack to fix LAT names: drop everything up to and including the
    # last '.' from the name held in the first slot of each first rule.
    for item in tagged[1]:
        first_rule = list(item['rules'][0])
        first_rule[0] = first_rule[0].split('.')[-1]
        other_rules = list(item['rules'])[1:]
        item['rules'] = tuple([first_rule] + other_rules)

    return LATFormatter.LATFormatter().format(
        tags=tagged, tokens=tokens, s=contents, input_file=input_file)