示例#1
0
def fix_punct(conllu_string):
	"""Parse a CoNLL-U string, run a FixPunct block over it and return the re-serialized CoNLL-U."""
	document = Document()
	document.from_conllu_string(conllu_string)
	FixPunct().process_document(document)
	return document.to_conllu_string()
示例#2
0
def fix_punct(conllu_string):
	"""Run a FixPunct block over a CoNLL-U string and return the result.

	Numeric ``# sent_id = N`` comment lines are stripped from the serialized
	output before it is returned.
	"""
	document = Document()
	document.from_conllu_string(conllu_string)
	FixPunct().process_document(document)
	serialized = document.to_conllu_string()
	# remove udapi sent_id
	return re.sub(r'# sent_id = [0-9]+\n', r'', serialized)
示例#3
0
    def test_print_subtree(self):
        """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

        Three renderings are compared against literal expectations: the
        default output of the loaded tree, the same tree with
        ``form,feats,misc`` attributes and no sent_id/text header, and a small
        hand-built non-projective tree.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
        doc.load_conllu(data_filename)
        root = doc.bundles[0].get_tree()

        expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                     "# text = Slovenská ústava: pro i proti\n"
                     "─┮\n"
                     " │ ╭─╼ Slovenská ADJ amod\n"
                     " ╰─┾ ústava NOUN root\n"
                     "   ┡─╼ : PUNCT punct\n"
                     "   ╰─┮ pro ADP appos\n"
                     "     ┡─╼ i CONJ cc\n"
                     "     ╰─╼ proti ADP conj\n"
                     "\n")
        expected2 = ("─┮\n"
                     " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                     " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                     "   ┡─╼ : _ _\n"
                     "   ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                     "     ┡─╼ i _ LId=i-1\n"
                     "     ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                     "\n")

        # test non-projective tree
        root3 = Root()
        for i in range(1, 5):
            root3.create_child(form=str(i))
        nodes = root3.descendants(add_self=1)
        nodes[1].parent = nodes[3]
        nodes[4].parent = nodes[2]
        expected3 = ("─┮\n"
                     " │ ╭─╼ 1\n"
                     " ┡─╪───┮ 2\n"
                     " ╰─┶ 3 │\n"
                     "       ╰─╼ 4\n"
                     "\n")

        # Remember the stream that is installed right now.  Restoring
        # sys.__stdout__ instead would silently discard any redirection or
        # capture set up by the caller (e.g. a test runner).
        previous_stdout = sys.stdout
        try:
            sys.stdout = capture = io.StringIO()
            root.print_subtree(color=False)
            self.assertEqual(capture.getvalue(), expected1)
            # Empty the buffer before the next rendering.
            capture.seek(0)
            capture.truncate()
            root.print_subtree(color=False, attributes='form,feats,misc',
                               print_sent_id=False, print_text=False)
            self.assertEqual(capture.getvalue(), expected2)
            capture.seek(0)
            capture.truncate()
            root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
            self.assertEqual(capture.getvalue(), expected3)
        finally:
            sys.stdout = previous_stdout  # pylint: disable=redefined-variable-type
示例#4
0
def fix_punct(conllu_string):
	"""Run a FixPunct block over a CoNLL-U string while shielding possessive apostrophes.

	A ``'`` form whose line continues with one more field and then
	``PART<TAB>POS`` is swapped for the placeholder ``&udapi_apos;`` before
	processing and swapped back afterwards.  Numeric ``# sent_id = N``
	comment lines are stripped from the returned string.
	"""
	# Protect possessive apostrophe from being treated as punctuation
	shielded = re.sub(r"\t'\t([^\t\n]+\tPART\tPOS)", r'\t&udapi_apos;\t\1', conllu_string, flags=re.MULTILINE)
	document = Document()
	document.from_conllu_string(shielded)
	FixPunct().process_document(document)
	# Put the real apostrophe back in place of the placeholder.
	restored = document.to_conllu_string().replace('&udapi_apos;',"'")
	# remove udapi sent_id
	return re.sub(r'# sent_id = [0-9]+\n',r'',restored)
示例#5
0
    def execute(self):
        """Parse given scenario and execute it.

        Blocks are built from ``self.args.scenario``, started with
        ``process_start()``, applied to a freshly created document in each
        round until every reader reports ``finished``, and finally closed with
        ``process_end()``.
        """

        # Parse the given scenario from the command line.
        block_names, block_args = _parse_command_line_arguments(
            self.args.scenario)

        # Import blocks (classes) and construct block instances.
        blocks = _import_blocks(block_names, block_args)

        # Initialize blocks (process_start).
        for block in blocks:
            block.process_start()

        # A block is treated as a reader when it exposes a `finished` attribute.
        readers = []
        for block in blocks:
            try:
                block.finished  # pylint: disable=pointless-statement
                readers.append(block)
            except AttributeError:
                pass
        if not readers:
            # Fall back to a default CoNLL-U reader placed in front of the scenario.
            logging.info('No reader specified, using read.Conllu')
            conllu_reader = Conllu()
            readers = [conllu_reader]
            blocks = readers + blocks

        # Apply blocks on the data.
        finished = False
        # Index of the next `block.filenames` entry to attach to a document.
        filenames_iterator = 0
        while not finished:
            document = Document()
            logging.info(" ---- ROUND ----")
            for block in blocks:
                # NOTE(review): `filenames` is read on every block, not only on
                # readers; a block without that attribute would raise
                # AttributeError here -- confirm all blocks define it.
                # NOTE(review): the index advances once per block that still has
                # an unused entry, so it may grow more than once per round --
                # confirm this is intended.
                if (filenames_iterator < len(block.filenames)):
                    filename = block.filenames[filenames_iterator]
                    document.set_filename(filename)
                    filenames_iterator += 1
                logging.info("Executing block " + block.__class__.__name__)
                block.before_process_document(document)
                result = block.process_document(document)
                # An int result is kept in `init_cluster_id`; nothing in this
                # method reads that variable afterwards.
                if (type(result) == int):
                    init_cluster_id = result
                block.after_process_document(document)

            # Stop once all readers report that they are finished.
            finished = True

            for reader in readers:
                finished = finished and reader.finished

        # 6. close blocks (process_end)
        for block in blocks:
            block.process_end()
示例#6
0
def process_doc(book_list, outfile_name):
    """Assemble a document from the trees of the given books and process it.

    For every book in ``book_list`` (in order) each tree stored in the
    module-level ``tree_dic`` is added to its own new bundle.  All entries of
    the module-level ``blocks`` are then applied, and the document is written
    as CoNLL-U to ``outfile_name`` when that name is truthy.
    """
    assembled = Document()

    for book in book_list:
        for _, tree in tree_dic[book]:
            assembled.create_bundle().add_tree(tree)

    for step in blocks:
        step.apply_on_document(assembled)

    if not outfile_name:
        return
    assembled.store_conllu(outfile_name)
示例#7
0
    def test_deps_getter(self):
        """Test enhanced dependencies."""
        # Location of the CoNLL-U fixture next to this test module.
        conllu_path = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')

        # Load the fixture through the reader block.
        doc = Document()
        Conllu(files=conllu_path).process_document(doc)

        # The fixture holds exactly one bundle.
        self.assertEqual(len(doc.bundles), 1)

        # The tree's bundle must carry the expected sentence ID.
        tree_root = doc.bundles[0].get_tree()
        self.assertEqual(tree_root.bundle.bundle_id, 'a-mf920901-001-p1s1A')

        # Raw secondary dependencies, node by node in descendant order.
        tokens = tree_root.descendants()
        expected_raw_deps = (
            '0:root|2:amod',
            '0:root',
            '0:root',
            '0:root',
            '1:amod',
            '5:conj',
        )
        for position, raw in enumerate(expected_raw_deps):
            self.assertEqual(tokens[position].raw_deps, raw)

        # Deserialized dependencies.
        self.assertEqual(tokens[0].deps[0]['parent'], tree_root)
        self.assertEqual(tokens[0].deps[0]['deprel'], 'root')
        self.assertEqual(tokens[5].deps[0]['parent'], tokens[4])
示例#8
0
 def setUpClass(cls):
     """Load enh_deps.conllu from the udapi package and add an empty node to its first tree."""
     document = cls.doc = Document()
     package_dir = os.path.dirname(udapi.__file__)
     cls.data = os.path.join(package_dir, "core", "tests", "data", "enh_deps.conllu")
     document.load_conllu(cls.data)
     first_tree = cls.tree = document.bundles[0].get_tree()
     cls.nodes = first_tree.descendants
     cls.add_empty_node(first_tree, 3)
示例#9
0
    def test_topology(self):
        """Test methods/properties descendants, children, prev_node, next_node, ord."""
        document = Document()
        conllu_path = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
        document.load_conllu(conllu_path)
        self.assertEqual(len(document.bundles), 1)
        tree_root = document.bundles[0].get_tree()
        words = tree_root.descendants
        words_called = tree_root.descendants()
        # Attribute-style and call-style access must give the same node sequence.
        self.assertEqual(words, words_called)
        self.assertEqual(len(words), 6)
        self.assertEqual(words[1].parent, tree_root)
        self.assertEqual(words[2].root, tree_root)
        self.assertEqual(len(words[1].descendants), 5)
        self.assertEqual(len(words[1].children), 3)
        self.assertEqual(len(words[1].children(add_self=True)), 4)
        self.assertEqual(len(words[1].children(add_self=1, following_only=1)), 3)

        # Neighbours in word order.
        self.assertEqual(words[0].next_node, words[1])
        self.assertEqual(words[2].prev_node, words[1])
        self.assertEqual(words[5].next_node, None)
        self.assertEqual(tree_root.prev_node, None)

        # Minimal common treelet of two nodes, then of three nodes.
        ancestor, extra = find_minimal_common_treelet(words[0], words[1])
        self.assertEqual(ancestor, words[1])
        self.assertEqual(list(extra), [])
        treelet_input = [words[2], words[4], words[5]]
        ancestor, extra = find_minimal_common_treelet(*treelet_input)
        self.assertEqual(ancestor, words[1])
        self.assertEqual(list(extra), [words[1], words[3]])

        # ords and reorderings
        self.assertEqual([word.ord for word in words], [1, 2, 3, 4, 5, 6])
        self.assertTrue(words[0].precedes(words[1]))
        self.assertTrue(words[0] < words[1])
        self.assertFalse(words[0] > words[1])
        self.assertTrue(words[0] <= words[0])
        words[0].shift_after_node(words[1])
        self.assertEqual([word.ord for word in words], [2, 1, 3, 4, 5, 6])
        self.assertEqual([word.ord for word in tree_root.descendants()], [1, 2, 3, 4, 5, 6])
示例#10
0
 def setUpClass(cls):
     """Read the AGLDT XML fixture into a document and keep the tree of bundle 263 for the tests."""
     document = cls.doc = Document()
     package_dir = os.path.dirname(tb2ud.__file__)
     cls.data = os.path.join(package_dir, "../test/data/tlg0011.tlg005.daphne_tb-grc1.xml")
     AgldtReader(cls.data).apply_on_document(document)
     # Report how many bundles were loaded.
     print(len(document.bundles))
     selected_tree = cls.tree = document.bundles[263].get_tree()
     cls.nodes = selected_tree.descendants
示例#11
0
 def setUpClass(cls):
     """Load the artificials.conllu fixture and prepare a writer for the tests."""
     cls.doc = Document()
     package_dir = os.path.dirname(tb2ud.__file__)
     cls.data = os.path.join(package_dir, "../test/data/artificials.conllu")
     loader = cls._reader = ConlluReader(files=cls.data)
     loader.apply_on_document(cls.doc)
     # cls.tree = cls.doc.bundles[0].get_tree()
     # cls.nodes = cls.tree.descendants
     cls.writer = ConlluWriter()
     # Starts out False; this method does not change it further.
     cls._subtreeconverted = False
示例#12
0
def fix_punct(conllu_string):
    """Run a FixPunct block over a CoNLL-U string while shielding some quote characters.

    Two kinds of forms are swapped for placeholders before processing and
    swapped back afterwards:

    * ``'`` followed by one more field and then ``PART<TAB>POS``
      (placeholder ``&udapi_apos;``);
    * ``"`` followed by five more tab-separated fields and then a character
      other than ``p`` (placeholder ``&udapi_quot;``) -- presumably a DEPREL
      that does not start with "p"; confirm against the column layout.

    Numeric ``# sent_id = N`` comment lines are stripped from the result.
    """
    apos_pattern = r"\t'\t([^\t\n]+\tPART\tPOS)"
    quot_pattern = r'\t"\t([^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^p])'
    shielded = re.sub(apos_pattern, r'\t&udapi_apos;\t\1', conllu_string, flags=re.MULTILINE)
    shielded = re.sub(quot_pattern, r'\t&udapi_quot;\t\1', shielded, flags=re.MULTILINE)

    document = UdapiDocument()
    document.from_conllu_string(shielded)
    FixPunct().process_document(document)

    # Swap the placeholders back for the real characters.
    restored = document.to_conllu_string()
    restored = restored.replace('&udapi_apos;', "'")
    restored = restored.replace('&udapi_quot;', '"')
    # remove udapi sent_id
    return re.sub(r'# sent_id = [0-9]+\n', r'', restored)
示例#13
0
def main():
    """Shift artificial nodes, convert subtrees and write out the sample document.

    The document is parsed from the module-level ``conllu_string1``.  The
    number of empty nodes of the first tree is printed between the
    conversion and the final write.
    """
    document = Document()
    document.from_conllu_string(conllu_string1)
    first_tree = document.bundles[0].get_tree()
    descendants = first_tree.descendants
    conllu_writer = ConlluWriter()

    # Shifter
    ShiftArtificials().apply_on_document(document)

    # Converter
    SubTreeConverter(with_enhanced=True).apply_on_document(document)

    print(len(first_tree.empty_nodes))

    # Writer
    conllu_writer.apply_on_document(document)
示例#14
0
def extract_senseid_children_collocates(conllu_filename):
    """Collect, per target form and sense id, the bundles in which each child collocate occurs.

    The CoNLL-U file is loaded and every node with a truthy
    ``misc['senseid']`` contributes its children.  The returned plain dict is
    nested as ``result[form][senseid][child_deprel]`` and ends in an
    ``OrderedDict`` mapping child form to a list of bundles, ordered by
    descending list length.
    """
    document = Document()
    # e.g. 'Chinese_train_pos.xml.utf8.sentences.conllu.senseid'
    document.load_conllu(conllu_filename)
    collocates = Vividict()
    for bundle in document.bundles:
        setattr_words(bundle=bundle)
        current = bundle.get_tree()
        while current:
            target = current.form
            senseid = current.misc['senseid']
            if senseid:  # For a verb like 想, list all children of the sense node:
                for child in current.children:
                    by_form = collocates[target][senseid][child.deprel]
                    # A missing entry compares equal to {}: start a new list then.
                    if by_form[child.form] == {}:
                        by_form[child.form] = [bundle]
                    else:
                        by_form[child.form].append(bundle)
            current = current.next_node

    # Convert back to ordinary dictionaries, ranking child forms by frequency.
    result = {}
    for target, by_sense in collocates.items():
        result[target] = {}
        for senseid, by_deprel in by_sense.items():
            result[target][senseid] = {}
            for deprel, by_form in by_deprel.items():
                ranked = sorted(by_form.items(),
                                key=lambda entry: len(entry[1]),
                                reverse=True)
                result[target][senseid][deprel] = OrderedDict(ranked)
    return result
示例#15
0
    def test_topology(self):
        """Test methods/properties descendants, children, prev_node, next_node, ord."""
        document = Document()
        conllu_path = os.path.join(os.path.dirname(__file__), 'data',
                                   'enh_deps.conllu')
        document.load_conllu(conllu_path)
        self.assertEqual(len(document.bundles), 1)
        tree_root = document.bundles[0].get_tree()
        words = tree_root.descendants
        words_called = tree_root.descendants()
        # Attribute-style and call-style access must give the same node sequence.
        self.assertEqual(words, words_called)
        self.assertEqual(len(words), 6)
        self.assertEqual(words[1].parent, tree_root)
        self.assertEqual(words[2].root, tree_root)
        self.assertEqual(len(words[1].descendants), 5)
        self.assertEqual(len(words[1].children), 3)
        self.assertEqual(len(words[1].children(add_self=True)), 4)
        self.assertEqual(
            len(words[1].children(add_self=1, following_only=1)), 3)

        # Neighbours in word order.
        self.assertEqual(words[0].next_node, words[1])
        self.assertEqual(words[2].prev_node, words[1])
        self.assertEqual(words[5].next_node, None)
        self.assertEqual(tree_root.prev_node, None)

        # Minimal common treelet of two nodes, then of three nodes.
        ancestor, extra = find_minimal_common_treelet(words[0], words[1])
        self.assertEqual(ancestor, words[1])
        self.assertEqual(list(extra), [])
        treelet_input = [words[2], words[4], words[5]]
        ancestor, extra = find_minimal_common_treelet(*treelet_input)
        self.assertEqual(ancestor, words[1])
        self.assertEqual(list(extra), [words[1], words[3]])

        # ords and reorderings
        self.assertEqual([word.ord for word in words], [1, 2, 3, 4, 5, 6])
        words[0].shift_after_node(words[1])
        self.assertEqual([word.ord for word in words], [2, 1, 3, 4, 5, 6])
        self.assertEqual([word.ord for word in tree_root.descendants()],
                         [1, 2, 3, 4, 5, 6])
示例#16
0
    def execute(self):
        """Parse given scenario and execute it.

        Blocks built from ``self.args.scenario`` are started, applied to a
        fresh document round after round until every reader reports
        ``finished``, and then closed.
        """
        # Turn the command-line scenario into block names and their arguments.
        names, arguments = _parse_command_line_arguments(self.args.scenario)

        # Import the block classes and instantiate them.
        blocks = _import_blocks(names, arguments)

        # Let every block initialize itself (process_start).
        for block in blocks:
            block.process_start()

        # A block counts as a reader when it exposes a `finished` attribute.
        readers = [block for block in blocks if hasattr(block, 'finished')]
        if not readers:
            logging.info('No reader specified, using read.Conllu')
            readers = [Conllu()]
            blocks = readers + blocks

        # Run the scenario on one new document per round.
        while True:
            document = Document()
            logging.info(" ---- ROUND ----")
            for block in blocks:
                logging.info("Executing block " + block.__class__.__name__)
                block.apply_on_document(document)

            # Stop as soon as all readers are finished.
            if all(reader.finished for reader in readers):
                break

        # Close the blocks (process_end).
        for block in blocks:
            block.process_end()
示例#17
0
def load():
    """Benchmark six udapi operations over 30 runs and print mean +/- std for each.

    Each run times loading 'cs-ud-train-l.conllu', reading form and lemma,
    filtering by deprel, overwriting deprel, computing the text of every
    tree, and storing the document to 'hello.conllu'.
    """
    from udapi.core.document import Document
    load_times, read_times, write_times = [], [], []
    text_times, relchain_times, save_times = [], [], []
    for _ in range(30):
        # Loading the file.
        tic = timeit.default_timer()
        document = Document()
        document.load_conllu('cs-ud-train-l.conllu')
        toc = timeit.default_timer()
        load_times.append(toc - tic)

        # Reading two attributes of every node.
        tic = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                for node in root.descendants:
                    form_lemma = node.form + node.lemma
        toc = timeit.default_timer()
        read_times.append(toc - tic)

        # Selecting nodes by their own and their parent's deprel.
        tic = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                chain = [n for n in root.descendants if n.deprel == "case" and n.parent.deprel == "nmod"]
        toc = timeit.default_timer()
        relchain_times.append(toc - tic)

        # Overwriting an attribute of every node.
        tic = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                for node in root.descendants:
                    node.deprel = 'dep'
        toc = timeit.default_timer()
        write_times.append(toc - tic)

        # Computing the text of every tree.
        tic = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                root.compute_text()
        toc = timeit.default_timer()
        text_times.append(toc - tic)

        # Storing the document.
        tic = timeit.default_timer()
        document.store_conllu('hello.conllu')
        toc = timeit.default_timer()
        save_times.append(toc - tic)

    report = [
        ('load', load_times),
        ('read', read_times),
        ('write', write_times),
        ('text', text_times),
        ('relchain', relchain_times),
        ('save', save_times),
    ]
    for label, samples in report:
        print("{}\t{} +/- {}".format(label, round(np.mean(samples), 2), round(np.std(samples), 2)))
示例#18
0
def load():
    """Exercise load, read, filter, write, compute_text and store on 'cs-ud-train-l.conllu'."""
    from udapi.core.document import Document
    corpus = Document()
    corpus.load_conllu('cs-ud-train-l.conllu')

    # Read two attributes of every node.
    for bundle in corpus:
        for tree in bundle:
            for word in tree.descendants:
                form_lemma = word.form + word.lemma

    # Select nodes by the deprel of their parent and grandparent.
    for bundle in corpus:
        for tree in bundle:
            chain = [
                n for n in tree.descendants
                if n.parent.deprel == "det" and n.parent.parent.deprel == "obj"
            ]

    # Overwrite the deprel of every node.
    for bundle in corpus:
        for tree in bundle:
            for word in tree.descendants:
                word.deprel = 'dep'

    # Compute the text of every tree.
    for bundle in corpus:
        for tree in bundle:
            tree.compute_text()

    corpus.store_conllu('hello.conllu')
 def test_iterator(self):
     """Iterate over a Document whose bundles were replaced by three strings and print each item."""
     document = Document()
     document.bundles = ['a', 'b', 'c']
     for item in document:
         print(item)
示例#20
0
    if outfile_name:
        ordered_doc.store_conllu(outfile_name)


if __name__ == '__main__':
    # Command-line interface: one positional input file plus optional flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", help="Input file")

    parser.add_argument('-a', '--all', action='store_true', help='create all the 6-book chunks possible')
    parser.add_argument('-s', '--start', type=int, default=1, help='Starting book')
    parser.add_argument('-e', '--end', type=int, default=24, help='Ending book')
    parser.add_argument('-o', '--out', help='Output file')
    args = parser.parse_args()
    # outname = args.out

    # Read the input file into a document and collect one tree per bundle.
    doc = Document()
    reader = AgldtReader(args.infile, fix_cycles=True)
    reader.apply_on_document(doc)
    trees = [b.get_tree() for b in doc.bundles]

    if args.all:
        # Process windows of six books (start .. start + 5), stopping once
        # the window would reach past book 24.
        # NOTE(review): no statement advancing `start` is visible in this
        # part of the loop -- confirm it is updated further down.
        start = 1
        while 1:
            stop = start + 5
            if stop > 24:
                break
            else:
                tree_dic = get_ordered_trees(trees, start, stop)
                book_list = sorted(tree_dic.keys())
                # The output name replaces '.tb.xml' in the input name with
                # the book range and a '.tb.conllu' ending.
                outf = args.infile.replace('.tb.xml', f'.{start}-{stop}.tb.conllu')
                process_doc(book_list, outf)
 def test_init(self):
     """Smoke test: constructing a Document without arguments must not raise."""
     document = Document()
示例#22
0
from udapi.core.document import Document

D = Document()
D.load_conllu('SemEval-2007/Chinese_train_pos.xml.utf8.sentences.conllu.senseid')

# Attach to every bundle the list of forms found by following next_node from
# the tree returned by get_tree(), then print the bundle id with that list.
for bundle in D.bundles:
    bundle.words = []
    cursor = bundle.get_tree()
    while cursor:
        bundle.words.append(cursor.form)
        cursor = cursor.next_node
    print(bundle.bundle_id, bundle.words)
示例#23
0
    def test_print_subtree(self):
        """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

        Three renderings are compared against literal expectations: the
        default output of the loaded tree, the same tree with
        ``form,feats,misc`` attributes and no sent_id/text header, and a small
        hand-built non-projective tree.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data',
                                     'enh_deps.conllu')
        doc.load_conllu(data_filename)
        root = doc.bundles[0].get_tree()

        expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                     "# text = Slovenská ústava: pro i proti\n"
                     "─┮\n"
                     " │ ╭─╼ Slovenská ADJ amod\n"
                     " ╰─┾ ústava NOUN root\n"
                     "   ┡─╼ : PUNCT punct\n"
                     "   ╰─┮ pro ADP appos\n"
                     "     ┡─╼ i CONJ cc\n"
                     "     ╰─╼ proti ADP conj\n"
                     "\n")
        expected2 = (
            "─┮\n"
            " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
            " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
            "   ┡─╼ : _ _\n"
            "   ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
            "     ┡─╼ i _ LId=i-1\n"
            "     ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
            "\n")

        # test non-projective tree
        root3 = Root()
        for i in range(1, 5):
            root3.create_child(form=str(i))
        nodes = root3.descendants(add_self=1)
        nodes[1].parent = nodes[3]
        nodes[4].parent = nodes[2]
        expected3 = ("─┮\n"
                     " │ ╭─╼ 1\n"
                     " ┡─╪───┮ 2\n"
                     " ╰─┶ 3 │\n"
                     "       ╰─╼ 4\n"
                     "\n")

        # Remember the stream that is installed right now.  Restoring
        # sys.__stdout__ instead would silently discard any redirection or
        # capture set up by the caller (e.g. a test runner).
        previous_stdout = sys.stdout
        try:
            sys.stdout = capture = io.StringIO()
            root.print_subtree(color=False)
            self.assertEqual(capture.getvalue(), expected1)
            # Empty the buffer before the next rendering.
            capture.seek(0)
            capture.truncate()
            root.print_subtree(color=False,
                               attributes='form,feats,misc',
                               print_sent_id=False,
                               print_text=False)
            self.assertEqual(capture.getvalue(), expected2)
            capture.seek(0)
            capture.truncate()
            root3.print_subtree(color=False,
                                attributes='form',
                                print_sent_id=0,
                                print_text=0)
            self.assertEqual(capture.getvalue(), expected3)
        finally:
            sys.stdout = previous_stdout  # pylint: disable=redefined-variable-type
示例#24
0
maxseed = 2 ** 32


def myrand(modulo):
    """Advance the module-level ``seed`` by one step and return it reduced modulo ``modulo``.

    The new seed is ``(1103515245 * seed + 12345) % maxseed``.  The global
    ``seed`` must have been assigned before the first call.
    """
    global seed
    advanced = 1103515245 * seed + 12345
    seed = advanced % maxseed
    return seed % modulo

# Optional leading "-d" switches on debug mode and is removed from argv, so
# the input and output paths are always read from positions 1 and 2.
# NOTE(review): sys.argv[1] is read unconditionally, so running the script
# without arguments raises IndexError -- confirm that is acceptable.
debug = False
if sys.argv[1] == "-d":
    debug = True
    sys.argv.pop(1)
in_conllu = sys.argv[1]
out_conllu = sys.argv[2]

print("init")

# Load the input file into a fresh document.
doc = Document()
doc.load({'filename':in_conllu})

print("load")
# In debug mode, store the freshly loaded document to 'udapi-load.conllu'.
if debug: doc.store({'filename':'udapi-load.conllu'})

# Visit every node of every tree without doing any work on it.
for bundle in doc:
    for root in bundle:
        for node in root.descendants():
            pass

print("iter")

for bundle in doc:
    for root in bundle:
        for node in root.descendants():
This script takes an AGDT xml file and generates a half-baked CoNLL-U file, right before the SetArtificial stage.
In this way, we create a test set to verify the problems in the SetArtificial stage.
"""

from udapi.core.document import Document
from udapi.block.agldt.setspaceafter import SetSpaceAfter
from udapi.block.read.agldt import Agldt as AgldtReader
from tb2ud import *
from tb2ud.text.updatetext import UpdateText
from tb2ud.postprocess.fixsomepos import FixSomePos
from collections import defaultdict
import re

# Path of the XML file with the artificial test sentences.
tst_file = "./data/artificial_sentences.xml"

# Read the test file into a fresh document (reader created with fix_cycles=True).
doc = Document()
reader = AgldtReader(tst_file, fix_cycles=True)
reader.apply_on_document(doc)
#trees = [b.get_tree() for b in doc.bundles]

blocks = [
    SetSpaceAfter(),
    CreateUpos(),
    CreateFeats(),
    SetMember(),
    ShallowConverter(),
    ShiftArtificials(),
    SubTreeConverter(with_enhanced=True),
    FixObj(),
    # SetArtificials(), MakeEnhanced(), # COMMENT OUT if you DO NOT want empty nodes and enhanced deps
    RehangPunct(),
示例#26
0
 def test_iterator(self):
     """Iterate over a Document whose bundles were replaced by three strings and print each item."""
     document = Document()
     document.bundles = ['a', 'b', 'c']
     for item in document:
         print(item)
示例#27
0
        if book_start <= int(bk) <= book_end:
            d[int(bk)].append((int(ln), tree))
    print("reordering the dictionary")
    for k in d.keys():
        d[k].sort(key=lambda x: x[0])
    return d


# Command-line interface: one positional input file, an optional book range
# (defaults 1 and 24) and an optional output file.
parser = argparse.ArgumentParser()
parser.add_argument("infile", help="Input file")
parser.add_argument('-s', '--start', type=int, default=1, help='Starting book')
parser.add_argument('-e', '--end', type=int, default=24, help='Ending book')
parser.add_argument('-o', '--out', help='Output file')
args = parser.parse_args()

# Read the input file and collect one tree per bundle.
doc = Document()
reader = AgldtReader(args.infile, fix_cycles=True)
reader.apply_on_document(doc)
trees = [b.get_tree() for b in doc.bundles]
# Group the trees of the requested book range; the keys are sorted to get the
# book order used below.
tree_dic = get_ordered_trees(trees, args.start, args.end)
book_list = sorted(tree_dic.keys())

# Build a new document holding the trees book by book, one bundle per tree.
ordered_doc = Document()

for book in book_list:
    for _, sent in tree_dic[book]:
        bund = ordered_doc.create_bundle()
        bund.add_tree(sent)

# Output file name from the command line (None when -o/--out was not given).
outname = args.out
示例#28
0
 def from_connlu(conllu):
     """Return a new Document populated from the given CoNLL-U string."""
     document = Document()
     document.from_conllu_string(conllu)
     return document