def process_doc(book_list, outfile_name):
    """Build one document from the given books, run the pipeline, and save it.

    Args:
        book_list: Iterable of keys into the module-level ``tree_dic``. Each
            ``tree_dic[book]`` is iterated as pairs; the second item of every
            pair is added as a tree to a freshly created bundle.
        outfile_name: Target path for the CoNLL-U output. When falsy, the
            document is processed but nothing is written.
    """
    document = Document()

    # One new bundle per sentence tree, in the order the books are listed.
    for book in book_list:
        for _, tree in tree_dic[book]:
            document.create_bundle().add_tree(tree)

    # Apply every block of the module-level pipeline, in list order.
    for step in blocks:
        step.apply_on_document(document)

    if not outfile_name:
        return
    document.store_conllu(outfile_name)
def _timed(func, *args):
    """Call ``func(*args)`` and return ``(elapsed_seconds, result)``."""
    start = timeit.default_timer()
    result = func(*args)
    end = timeit.default_timer()
    return end - start, result


def _load_document(document_cls, path):
    """Instantiate ``document_cls`` and load the CoNLL-U file at ``path``."""
    document = document_cls()
    document.load_conllu(path)
    return document


def _read_nodes(document):
    """Read ``form`` and ``lemma`` of every node; the value is discarded."""
    for bundle in document:
        for root in bundle:
            for node in root.descendants:
                _ = node.form + node.lemma


def _scan_relchains(document):
    """Collect, per tree, the ``case`` nodes whose parent is ``nmod``."""
    for bundle in document:
        for root in bundle:
            _ = [n for n in root.descendants
                 if n.deprel == "case" and n.parent.deprel == "nmod"]


def _overwrite_deprels(document):
    """Set ``deprel`` of every node to ``'dep'``."""
    for bundle in document:
        for root in bundle:
            for node in root.descendants:
                node.deprel = 'dep'


def _compute_texts(document):
    """Call ``compute_text()`` on every tree root."""
    for bundle in document:
        for root in bundle:
            root.compute_text()


def load(conllu_path='cs-ud-train-l.conllu', out_path='hello.conllu',
         repeats=30):
    """Benchmark common udapi document operations and print the timings.

    Every repetition loads ``conllu_path`` into a new document and then times,
    in this order: reading node attributes, a relation-chain query, writing
    ``deprel``, computing the text of every tree, and storing the document to
    ``out_path``. Afterwards one line per phase is printed in the form
    ``name<TAB>mean +/- std`` (seconds, rounded to two decimals).

    Args:
        conllu_path: CoNLL-U file loaded at the start of each repetition.
        out_path: File the document is stored to at the end of each repetition.
        repeats: Number of repetitions to average over.

    Raises:
        ValueError: If ``repeats`` is smaller than 1 (there would be no
            samples to average).
    """
    # Validate before importing or touching any file so bad input fails fast.
    if repeats < 1:
        raise ValueError("repeats must be at least 1, got {!r}".format(repeats))

    from udapi.core.document import Document

    # Order in which the phases appear in the printed report.
    report_order = ('load', 'read', 'write', 'text', 'relchain', 'save')
    timings = {phase: [] for phase in report_order}

    # Execution order differs from report order: the relchain query filters on
    # deprel, so it has to run before the write phase overwrites every deprel.
    document_phases = (
        ('read', _read_nodes),
        ('relchain', _scan_relchains),
        ('write', _overwrite_deprels),
        ('text', _compute_texts),
    )

    for _ in range(repeats):
        elapsed, document = _timed(_load_document, Document, conllu_path)
        timings['load'].append(elapsed)

        for phase, work in document_phases:
            elapsed, _result = _timed(work, document)
            timings[phase].append(elapsed)

        elapsed, _result = _timed(document.store_conllu, out_path)
        timings['save'].append(elapsed)

    for phase in report_order:
        samples = timings[phase]
        print("{}\t{} +/- {}".format(phase,
                                     round(np.mean(samples), 2),
                                     round(np.std(samples), 2)))
def load():
    """Run a fixed udapi workload once, without timing it.

    Loads ``cs-ud-train-l.conllu``, reads ``form`` and ``lemma`` of every
    node, evaluates a parent/grandparent ``deprel`` query on every tree, sets
    every ``deprel`` to ``'dep'``, calls ``compute_text()`` on every tree and
    finally stores the document to ``hello.conllu``.
    """
    from udapi.core.document import Document

    def iter_trees(source):
        # Walk the document bundle by bundle and yield each tree root.
        for bundle in source:
            for tree in bundle:
                yield tree

    doc = Document()
    doc.load_conllu('cs-ud-train-l.conllu')

    # Attribute reads; the concatenated value itself is not used.
    for tree in iter_trees(doc):
        for node in tree.descendants:
            _ = node.form + node.lemma

    # Query: nodes whose parent is "det" and whose grandparent is "obj".
    for tree in iter_trees(doc):
        _ = [
            n for n in tree.descendants
            if n.parent.deprel == "det" and n.parent.parent.deprel == "obj"
        ]

    # Attribute writes.
    for tree in iter_trees(doc):
        for node in tree.descendants:
            node.deprel = 'dep'

    for tree in iter_trees(doc):
        tree.compute_text()

    doc.store_conllu('hello.conllu')
from collections import defaultdict
import re

# Source file handed to the AGLDT reader.
tst_file = "./data/artificial_sentences.xml"

# Read the input into a fresh document; the reader is created with
# fix_cycles=True.
doc = Document()
reader = AgldtReader(tst_file, fix_cycles=True)
reader.apply_on_document(doc)

# Debugging aid, disabled:
# trees = [b.get_tree() for b in doc.bundles]

# Conversion pipeline; the blocks are applied to the document in list order.
blocks = [
    SetSpaceAfter(),
    CreateUpos(),
    CreateFeats(),
    SetMember(),
    ShallowConverter(),
    ShiftArtificials(),
    SubTreeConverter(with_enhanced=True),
    FixObj(),
    # SetArtificials(),  # disabled for this run
    MakeEnhanced(),  # comment out if you do NOT want empty nodes and enhanced deps
    RehangPunct(),
    FixSomePos(),
    PurgeMisc(),
    UpdateText(),
]

for block in blocks:
    block.apply_on_document(doc)

doc.store_conllu("./data/non_transformed_artificials.conllu")
# Add every sentence tree of every listed book to ordered_doc, one bundle per
# tree. tree_dic[book] is iterated as pairs; only the second item is used.
for book in book_list:
    for _, sent in tree_dic[book]:
        bund = ordered_doc.create_bundle()
        bund.add_tree(sent)

# Output path from the parsed arguments; a falsy value disables writing.
outname = args.out

# Conversion pipeline; the blocks are applied to the document in list order.
blocks = [
    SetSpaceAfter(),
    CreateUpos(),
    CreateFeats(),
    SetMember(),
    ShallowConverter(),
    ShiftArtificials(),
    SubTreeConverter(with_enhanced=True),
    FixObj(),
    SetArtificials(),
    MakeEnhanced(),  # comment out if you do NOT want empty nodes and enhanced deps
    RehangPunct(),
    FixSomePos(),
    PurgeMisc(),
    UpdateText(),
]

for block in blocks:
    block.apply_on_document(ordered_doc)

if outname:
    ordered_doc.store_conllu(outname)