예제 #1
0
def get_nonprojectivity_ratios():
    """Return, per language, the share of training trees kept by ``get_np_trees``.

    The training treebank paths are looked up through
    ``lang_utils.get_ud_paths`` and each language's trees are loaded with
    ``udtree.from_files``.

    Returns:
        A dict mapping each language key to
        ``len(get_np_trees(trees)) / len(trees)``.

    Raises:
        ZeroDivisionError: if a language yields no trees at all.
    """
    paths_by_lang = lang_utils.get_ud_paths('../resources/universaldependencies1-2/universal-dependencies-1.2/', type_='train', format_='conllu', coarse=False)

    total_counts = {}
    selected_trees = {}
    for lang, path in paths_by_lang.items():
        # Materialise the trees so they can be both counted and filtered.
        loaded = list(udtree.from_files(path))
        total_counts[lang] = len(loaded)
        selected_trees[lang] = get_np_trees(loaded)

    # Ratios are computed only after every language has been loaded.
    return {
        lang: len(selected) / total_counts[lang]
        for lang, selected in selected_trees.items()
    }
#!/usr/local/bin/python3
# TODO: Convert conll-u to conll-x and then back again

# To run MaltOptimizer, you have to be in the MaltOptimizer directory:
# cd {project_base}/tools/MaltOptimizer-1.0.3
# ~/dev/miniconda/bin/python3 ../train_maltparser.py

from subprocess import call
from os.path import join
from os import rename
import lang_utils

# Locations of the UD 1.2 treebanks and of the project checkout.
treebank_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/resources/universaldependencies1-2/universal-dependencies-1.2/"
train_files = lang_utils.get_ud_paths(treebank_base, type_="train", format_="conllx")
# Restrict this run to the Czech training data only.
train_files = {"Czech": train_files['Czech']}
project_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/"

maltparser_path = join(project_base, "tools",
                       "maltparser-1.8.1", "maltparser-1.8.1.jar")
maltoptimizer_path = join(project_base, "tools", "MaltOptimizer-1.0.3", "MaltOptimizer.jar")

# Pieces of the java command line; they are concatenated per language below.
base_cmd = ["java", "-Xmx8G"]  # 8G is only necessary for Czech
jar_path = ["-jar", maltparser_path]
mode = ["-m", "learn"]

for lang, train_file in train_files.items():
    print("Training language {}".format(lang))
    # Only the first path returned for the language is used as training input.
    training_path = ["-i", train_file[0]]
    model_path = ["-c", "ud-1.2." + lang]
    exit_code = call(base_cmd + jar_path + ["-grl", "root"] + model_path + mode + training_path)
    # call() returns the child's exit status; report failures instead of
    # silently moving on to the next language.
    if exit_code != 0:
        print("MaltParser training failed for {} (exit code {})".format(lang, exit_code))
# TODO: Convert conll-u to conll-x and then back again

from subprocess import call
from os.path import join
from os import remove
from shutil import copyfile
import lang_utils

# Locations of the project checkout and of the UD 1.2 treebanks.
project_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/"
treebank_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/resources/universaldependencies1-2/universal-dependencies-1.2/"
test_files = lang_utils.get_ud_paths(treebank_base, type_="dev", format_="conllx")
maltparser_path = join(project_base, "tools",
                       "maltparser-1.8.1", "maltparser-1.8.1.jar")

# Pieces of the java command line; they are concatenated per language below.
base_cmd = ["java", "-Xmx8G"]
jar_path = ["-jar", maltparser_path]
mode = ["-m", "parse"]

for lang, test_file in test_files.items():
    # Get language's parsing model
    print("Parsing {} from {}".format(lang, test_file))
    model_path = ["-c", "ud-1.2." + lang]
    model_file = model_path[1] + ".mco"
    # The stored model is copied to a relative path (the current working
    # directory) under the same name that is passed to -c.
    copyfile(join(project_base, "resources", "maltdefault_models_1-2", model_file), model_file)
    try:
        # NOTE(review): if test_file[0] is already absolute, join() returns it
        # unchanged and treebank_base/lang are ignored -- confirm what
        # get_ud_paths returns.
        test_path = ["-i", join(treebank_base, lang, test_file[0])]
        output_path = ["-o", join(project_base, "resources", "maltdefault_output_dev_1-2", lang + ".conllx")]
        exit_code = call(base_cmd + jar_path + model_path + mode + test_path + output_path)
        # call() returns the child's exit status; report failures instead of
        # silently moving on to the next language.
        if exit_code != 0:
            print("MaltParser parsing failed for {} (exit code {})".format(lang, exit_code))
    finally:
        # Always delete the temporary model copy, even if launching java raised.
        remove(model_file)

# java -jar maltparser-1.8.1.jar -c es-model -m learn -i ../../resources/universaldependencies1-1/ud-treebanks-v1.1/UD_Spanish/es-ud-dev.conllx
예제 #4
0
    for i, head in zip(tree.ids, tree.heads):
        if head-1 < 0:
            continue
        else:
            for j, inner_head in zip(tree.ids[i:head-1], tree.heads[i:head-1]):
                if inner_head < i or inner_head > head:
                    is_non_projective = True
    return is_non_projective

def get_np_trees(trees):
    """Return a list of the trees for which ``is_non_projective`` is truthy.

    Args:
        trees: An iterable of tree objects accepted by ``is_non_projective``.

    Returns:
        A new list holding the selected trees in their original order.
    """
    return [candidate for candidate in trees if is_non_projective(candidate)]

def get_nonprojectivity_ratios():
    """Return, per language, the share of training trees kept by ``get_np_trees``.

    The training treebank paths are looked up through
    ``lang_utils.get_ud_paths`` and each language's trees are loaded with
    ``udtree.from_files``.

    Returns:
        A dict mapping each language key to
        ``len(get_np_trees(trees)) / len(trees)``.

    Raises:
        ZeroDivisionError: if a language yields no trees at all.
    """
    lang_trees = lang_utils.get_ud_paths('../resources/universaldependencies1-2/universal-dependencies-1.2/', type_='train', format_='conllu', coarse=False)

    nonprojective = {}
    totals = {}
    for lang, path in lang_trees.items():
        # Materialise the trees so they can be both counted and filtered.
        trees = list(udtree.from_files(path))
        totals[lang] = len(trees)
        nonprojective[lang] = get_np_trees(trees)

    ratios = {}
    for lang, selected in nonprojective.items():
        ratios[lang] = len(selected) / totals[lang]
    # Hand the computed ratios back to the caller.
    return ratios
# TODO: Convert conll-u to conll-x and then back again

from subprocess import call
from os.path import join
from os import remove
from shutil import copyfile
import lang_utils

# Locations of the project checkout and of the UD 1.2 treebanks.
project_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/"
treebank_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/resources/universaldependencies1-2/universal-dependencies-1.2/"
test_files = lang_utils.get_ud_paths(treebank_base, type_="test", format_="conllx", coarse=True)
maltparser_path = join(project_base, "tools",
                       "maltparser-1.8.1", "maltparser-1.8.1.jar")

# Pieces of the java command line; they are concatenated per language below.
base_cmd = ["java", "-Xmx8G"]
jar_path = ["-jar", maltparser_path]
mode = ["-m", "parse"]

for lang, test_file in test_files.items():
    # Get language's parsing model
    print("Parsing {} from {}".format(lang, test_file))
    model_path = ["-c", "ud-1.2." + lang]
    model_file = model_path[1] + ".mco"
    # The stored coarse model is copied to a relative path (the current
    # working directory) under the same name that is passed to -c.
    copyfile(join(project_base, "resources", "maltdefault_coarse_models_1-2", model_file), model_file)
    try:
        # NOTE(review): if test_file[0] is already absolute, join() returns it
        # unchanged and treebank_base/lang are ignored -- confirm what
        # get_ud_paths returns.
        test_path = ["-i", join(treebank_base, lang, test_file[0])]
        output_path = ["-o", join(project_base, "resources", "maltdefault_coarse_output_test_1-2", lang + ".conllx")]
        exit_code = call(base_cmd + jar_path + model_path + mode + test_path + output_path)
        # call() returns the child's exit status; report failures instead of
        # silently moving on to the next language.
        if exit_code != 0:
            print("MaltParser parsing failed for {} (exit code {})".format(lang, exit_code))
    finally:
        # Always delete the temporary model copy, even if launching java raised.
        remove(model_file)

# java -jar maltparser-1.8.1.jar -c es-model -m learn -i ../../resources/universaldependencies1-1/ud-treebanks-v1.1/UD_Spanish/es-ud-dev.conllx
예제 #6
0
def convert(to_convert_files):
    """Write each language's trees as a CoNLL-X file under ``project_base``.

    For every ``(lang, files)`` pair the trees are read with
    ``udtree.from_files`` and written to ``<project_base>/UD_<lang>/<name>``.
    The suffix is ``.conllx``, or ``.coarse_deprels.conllx`` when the
    module-level ``fine_grained_deprels`` flag is false.

    Args:
        to_convert_files: Mapping from language name to a sequence of input
            file paths. An empty mapping is a no-op.

    Returns:
        None. The results are written to disk.
    """
    for lang, files in to_convert_files.items():
        file_ending = ".conllx"
        if not fine_grained_deprels:
            file_ending = ".coarse_deprels.conllx"
        if lang == "Czech" and len(files) > 1:
            # Several Czech input files end up in one fixed-name output file.
            file_name = "cs-ud-train" + file_ending
        else:
            # Base name of the first input file, up to its first dot.
            file_name = files[0].split("/")[-1].split(".")[0] + file_ending

        outfile = join(project_base, "UD_" + lang, file_name)
        trees = udtree.from_files(files)
        # Treebank text is multilingual; write UTF-8 explicitly instead of
        # relying on the platform's default encoding.
        with open(outfile, "w", encoding="utf-8") as w:
            for tree in trees:
                # Blank out the DEPS and MISC fields of every word before export.
                for word in tree.sentence_structure:
                    word['deps'] = None
                    word['misc'] = None
                if not any(tree.postags):  # Copy CPOSTAG to POSTAG
                    tree.postags = tree.cpostags
                # One blank line separates consecutive sentences.
                w.write("\n".join(tree.to_conllx_format(fine_grained_deprels=fine_grained_deprels)) + "\n\n")


# Look up the CoNLL-U inputs for every split first, then convert them in
# train -> dev -> test order.
ud_query = {"format_": "conllu", "coarse": False}
train_files = lang_utils.get_ud_paths(project_base, type_="train", **ud_query)
dev_files = lang_utils.get_ud_paths(project_base, type_="dev", **ud_query)
test_files = lang_utils.get_ud_paths(project_base, type_="test", **ud_query)

for split_files in (train_files, dev_files, test_files):
    convert(split_files)