Example #1
# Imports assumed by this snippet; in the pandora package the Tagger class
# is expected to live in pandora.tagger (adjust the import if it differs).
import os
import codecs

import pandora.utils
from pandora.tagger import Tagger


def tag_dir(model, input_dir, output_dir, tokenized_input, string=None, **kwargs):
    """ Tag a directory of texts

    :param model: Path to a model directory
    :param input_dir: Path to a directory containing text files
    :param output_dir: Path to a directory for the tagged output files
    :param tokenized_input: Whether the input files are already tokenized
    """
    print('::: started :::')

    # Extra keyword arguments are passed on to the Tagger as config overrides.
    tagger = Tagger(load=True, model_dir=model, overwrite=kwargs)
    print('Tagger loaded, now annotating...')

    orig_path = input_dir
    new_path = output_dir

    for filename in os.listdir(orig_path):
        # Only plain-text files are tagged.
        if not filename.endswith('.txt'):
            continue

        print('\t +', filename)
        unseen_tokens = pandora.utils.load_unannotated_file(
            os.path.join(orig_path, filename),
            nb_instances=None,
            tokenized_input=tokenized_input
        )

        annotations = tagger.annotate(unseen_tokens)
        keys = list(annotations.keys())
        print("Keys: " + "\t".join(keys))
        # One .tsv per input file: a header row with the annotation layers,
        # then one tab-separated row per token.
        with codecs.open(os.path.join(new_path, filename + ".tsv"), 'w', 'utf8') as f:
            f.write("\t".join(keys) + "\n")
            for row in zip(*(annotations[k] for k in keys)):
                f.write('\t'.join(row) + '\n')

    print('::: ended :::')
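A minimal usage sketch for the function above; the model and directory paths are hypothetical placeholders rather than values from the original source, and the output directory is assumed to already exist.

# Hypothetical call to tag_dir(); all paths below are placeholders.
tag_dir(
    model='models/my_model',      # directory holding a trained pandora model
    input_dir='data/plain',       # directory of .txt files to tag
    output_dir='data/tagged',     # one .tsv per input file is written here
    tokenized_input=False         # let the loader tokenize the raw text
)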
Example #2
def main():
    # Tag every .txt file in data/wilhelmus/orig/ with a pre-trained model
    # and write token / lemma / POS columns to data/wilhelmus/tagged/.
    print('::: started :::')

    tagger = Tagger(load=True, model_dir='models/wilhelmus_full')

    print('Tagger loaded, now annotating...')

    orig_path = 'data/wilhelmus/orig/'
    new_path = 'data/wilhelmus/tagged/'

    for filename in os.listdir(orig_path):
        if not filename.endswith('.txt'):
            continue

        print('\t +', filename)
        unseen_tokens = pandora.utils.load_unannotated_file(
            orig_path + filename,
            nb_instances=None,
            tokenized_input=False
        )

        annotations = tagger.annotate(unseen_tokens)
        with codecs.open(new_path + filename, 'w', 'utf8') as f:
            # Other annotation layers (e.g. 'postcorrect_lemmas') can be
            # written instead of 'lemmas' if the loaded model provides them.
            for t, l, p in zip(annotations['tokens'], annotations['lemmas'],
                               annotations['pos']):
                f.write('\t'.join((t, l, p)) + '\n')
    
    print('::: ended :::')
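If this example were saved as a standalone script, it would typically be run through the usual entry-point guard shown below; the original file's entry point is not included in the snippet, so this is an assumption.

if __name__ == '__main__':
    main()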
Example #3
def tag_string(model, input_dir, output_dir=None, string=None, **kwargs):
    """ Tag a single untokenized string

    :param model: Path to a model directory
    :param input_dir: Untokenized string to tag
    """

    print('::: started :::')

    # Extra keyword arguments are passed on to the Tagger as config overrides.
    tagger = Tagger(load=True, model_dir=model, overwrite=kwargs)

    print('Tagger loaded, now annotating...')

    # `tokenize` is assumed to be a module-level tokenizer (e.g. a compiled
    # whitespace regex) defined elsewhere in the original source file.
    unseen_tokens = tokenize.split(input_dir)
    print(unseen_tokens)

    annotations = tagger.annotate(unseen_tokens)

    keys = list(annotations.keys())
    print("--------------------")
    print('\t'.join(keys))
    print("--------------------")
    # Print one tab-separated row per token, one column per annotation layer.
    for row in zip(*(annotations[k] for k in keys)):
        print('\t'.join(row))

    print('::: ended :::')
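A short sketch of how tag_string() might be called; the model path and the input sentence are placeholders, and the printed columns depend on which annotation layers the loaded model provides.

# Hypothetical call to tag_string(); 'models/my_model' is a placeholder.
tag_string(
    model='models/my_model',
    input_dir='This is a short test sentence to tag.'  # raw, untokenized text
)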