def main():
    """Print, for every sentence of ``--infile``, whether its tree is projective.

    The file is parsed with ``CoNLLReader.read_conll_u_8cols``.  One line is
    printed per sentence: its 0-based index followed by the value returned by
    ``is_fully_projective()``.  When ``--infile`` is omitted nothing is read
    and nothing is printed.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('--infile', help="conllu file")
    # --lang is still accepted so existing command lines keep working; it is
    # not used by this script.
    parser.add_argument('--lang', help="")

    args = parser.parse_args()

    rdr = CoNLLReader()
    gold_sentences = rdr.read_conll_u_8cols(args.infile) if args.infile else []

    for idx, s in enumerate(gold_sentences):
        print(idx, s.is_fully_projective())
예제 #2
0
def main():
    """Filter the content of a CoNLL-U treebank and write it in a CoNLL format.

    The positional arguments ``input`` and ``output`` name the source file and
    the target path.  Every sentence is passed through
    ``filter_sentence_content`` with the options chosen on the command line and
    the POS precedence list registered for ``--lang`` (the ``"default"`` list is
    used when the language has no entry).  The result is written with
    ``CoNLLReader.write_conll`` in ``--output_format``.

    Exits with status 1 when run under Python 2.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true")
    parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod", default=False, action="store_true")
    parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'],  metavar='prop', type=str, nargs='+')
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006")
    parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true")
    parser.add_argument('--print_comments',default=False,action="store_true")
    parser.add_argument('--print_fused_forms',default=False,action="store_true")

    args = parser.parse_args()

    if sys.version_info < (3,0):
        print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
        sys.exit(1)

    # str.split() with no argument is used on purpose: the previous
    # split(" ") on a string ending in a space appended an empty "" tag to the
    # default precedence list.
    POSRANKPRECEDENCEDICT = {
        "default": "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT".split(),
        # "de": "PROPN ADP DET".split(),
        "es": "VERB AUX PRON ADP DET".split(),
        "fr": "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(),
        "it": "VERB AUX ADV PRON ADP DET INTJ".split(),
    }
    # Languages without their own entry fall back to the default ranking.
    current_pos_precedence_list = POSRANKPRECEDENCEDICT.get(args.lang, POSRANKPRECEDENCEDICT["default"])

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    # copy.copy is shallow: the sentence objects themselves are shared with
    # orig_treebank and are modified in place by the loop below.
    modif_treebank = copy.copy(orig_treebank)

    # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list
    # We keep it for future modifications, i.e. any language-specific modules
    for s in modif_treebank:
        s.filter_sentence_content(
            args.replace_subtokens_with_fused_forms,
            args.lang,
            current_pos_precedence_list,
            args.remove_node_properties,
            args.remove_deprel_suffixes,
            args.remove_arabic_diacritics,
        )

    cio.write_conll(
        modif_treebank,
        args.output,
        args.output_format,
        print_fused_forms=args.print_fused_forms,
        print_comments=args.print_comments,
    )
def main():
    """Filter the content of a CoNLL-U treebank and write it in a CoNLL format.

    The positional arguments ``input`` and ``output`` name the source file and
    the target path.  Every sentence is passed through
    ``filter_sentence_content`` with the options chosen on the command line and
    the POS precedence list registered for ``--lang`` (the ``"default"`` list is
    used when the language has no entry).  The result is written with
    ``CoNLLReader.write_conll`` in ``--output_format``.

    Exits with status 1 when run under Python 2.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true")
    parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod", default=False, action="store_true")
    parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'],  metavar='prop', type=str, nargs='+')
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006")
    parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true")
    parser.add_argument('--print_comments',default=False,action="store_true")
    parser.add_argument('--print_fused_forms',default=False,action="store_true")

    args = parser.parse_args()

    if sys.version_info < (3,0):
        print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
        sys.exit(1)

    # str.split() with no argument is used on purpose: the previous
    # split(" ") on strings ending in a space appended an empty "" tag to the
    # "default" and "de" precedence lists.
    POSRANKPRECEDENCEDICT = {
        "default": "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CONJ SCONJ X PUNCT".split(),
        "de": "PROPN ADP DET".split(),
        "es": "VERB AUX PRON ADP DET".split(),
        "fr": "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(),
        "it": "VERB AUX ADV PRON ADP DET".split(),
    }
    # Languages without their own entry fall back to the default ranking.
    current_pos_precedence_list = POSRANKPRECEDENCEDICT.get(args.lang, POSRANKPRECEDENCEDICT["default"])

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    # copy.copy is shallow: the sentence objects themselves are shared with
    # orig_treebank and are modified in place by the loop below.
    modif_treebank = copy.copy(orig_treebank)

    # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list
    # We keep it for future modifications, i.e. any language-specific modules
    for s in modif_treebank:
        s.filter_sentence_content(
            args.replace_subtokens_with_fused_forms,
            args.lang,
            current_pos_precedence_list,
            args.remove_node_properties,
            args.remove_deprel_suffixes,
            args.remove_arabic_diacritics,
        )

    cio.write_conll(
        modif_treebank,
        args.output,
        args.output_format,
        print_fused_forms=args.print_fused_forms,
        print_comments=args.print_comments,
    )
예제 #4
0
def main():
    """Run ``apply_transform`` over every sentence of a CoNLL-U file.

    Reads ``--input``, transforms each sentence for ``--lang``, and writes the
    shallow copies of the transformed sentences to ``--output`` in CoNLL-U
    format with fused forms and comments included.  Exits with status 1 under
    Python 2.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('--input', help="conllu file", default='../..//UD_Catalan/ca-all.conllu')
    parser.add_argument('--output', help="target file", type=Path,default="catout.conllu")
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    args = parser.parse_args()

    if sys.version_info < (3,0):
        print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
        sys.exit(1)

    cio = CoNLLReader()
    # Transform sentence by sentence, keeping a shallow copy of each result.
    modif_treebank = [
        copy.copy(apply_transform(sentence, args.lang))
        for sentence in cio.read_conll_u(args.input)
    ]
    cio.write_conll(
        modif_treebank,
        args.output,
        conllformat='conllu',
        print_fused_forms=True,
        print_comments=True,
    )
def main():
    """Print non-projectivity statistics of a treebank as one tab-separated line.

    ``--infile`` is read with ``CoNLLReader.read_conll_2006_dense``.  The
    output fields, each rounded to two decimals, are:

    * an empty first field,
    * the share of sentences for which ``is_fully_projective()`` is true,
    * for each tag in ``POSLIST``, its share of the first counter returned by
      ``non_projectivity_edge_info()``,
    * for each value in ``EDGELENGTHS``, its share of the second counter,
    * the combined share of every other key of that second counter.

    A share whose denominator is zero (no sentences, or an empty counter) is
    reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('--infile', help="conllu file", default="/Users/hmartine/proj/eval_multisource/data/2project/watchtower/en.2proj.conll.head1000")

    args = parser.parse_args()

    def share(count, total):
        # Guarded division: an empty counter yields 0.0 rather than a crash.
        return count / total if total else 0.0

    DEPCHILDCOUNTER = Counter()
    GAPDEGREECOUNTER = Counter()
    PROJCOUNTER = Counter()

    rdr = CoNLLReader()
    gold_sentences = rdr.read_conll_2006_dense(args.infile) if args.infile else []

    for s in gold_sentences:
        local_isproj = s.is_fully_projective()
        localdependentcounter, gapdegreecounter = s.non_projectivity_edge_info()
        PROJCOUNTER.update([local_isproj])
        DEPCHILDCOUNTER += localdependentcounter
        GAPDEGREECOUNTER += gapdegreecounter

    # The denominators are loop-invariant, so compute each of them once.
    num_sentences = sum(PROJCOUNTER.values())
    dep_total = sum(DEPCHILDCOUNTER.values())
    gap_total = sum(GAPDEGREECOUNTER.values())

    projpercent = round(share(PROJCOUNTER[True], num_sentences), 2)
    deppercent = [round(share(DEPCHILDCOUNTER[posname], dep_total), 2) for posname in POSLIST]
    edgelenths = [round(share(GAPDEGREECOUNTER[l], gap_total), 2) for l in EDGELENGTHS]
    # Everything not listed in EDGELENGTHS is lumped into a single figure.
    otherlength = round(
        sum([share(GAPDEGREECOUNTER[l], gap_total) for l in GAPDEGREECOUNTER.keys() if l not in EDGELENGTHS]),
        2,
    )
    print("\t".join([str(x) for x in ["", projpercent] + deppercent + edgelenths + [otherlength]]))
예제 #6
0
def main():
    """Run ``apply_transform`` over every sentence of a CoNLL-U file.

    Reads ``--input``, transforms each sentence for ``--lang``, and writes the
    shallow copies of the transformed sentences to ``--output`` in CoNLL-U
    format with fused forms and comments included.  Exits with status 1 under
    Python 2.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('--input', help="conllu file", default='../data/v2/UD_Spanish-Ancora/es_ancora-ud-train.conllu')
    parser.add_argument('--output', help="target file", type=Path, default="es_train_out.conllu")
    parser.add_argument('--lang', help="specify a language 2-letter code", default="es")
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x.")  # suggestion: install anaconda python
        sys.exit(1)

    cio = CoNLLReader()
    # Transform sentence by sentence, keeping a shallow copy of each result.
    modif_treebank = [
        copy.copy(apply_transform(sentence, args.lang))
        for sentence in cio.read_conll_u(args.input)
    ]
    cio.write_conll(modif_treebank, args.output, conllformat='conllu', print_fused_forms=True, print_comments=True)
예제 #7
0
def main():
    """Split a treebank into several files based on tokens in sentence comments.

    ``--mapping`` names a text file whose non-blank lines each hold two
    whitespace-separated fields: a comment token and a target name.  Every
    tree whose comment (``tree.graph['comment']``) contains a mapped token is
    added to the corresponding target; trees without any mapped token go to
    the ``"various"`` target.  Each target is written in CoNLL 2006 format to
    ``<output>_<target>``, next to the requested output path.

    Raises:
        ValueError: if a non-blank mapping line does not have exactly two fields.
    """
    parser = argparse.ArgumentParser(
        description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format',
                        choices=['conll2006', 'conll2006dense', 'conllu'],
                        default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)

    args = parser.parse_args()

    mapping = {}
    # The context manager closes the mapping file; blank lines are skipped
    # instead of breaking the two-field unpacking.
    with open(args.mapping) as mapping_file:
        for line in mapping_file:
            line = line.strip()
            if not line:
                continue
            commentpart, target = line.split()
            mapping[commentpart] = target

    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    print("Loaded treebank {} with {} sentences".format(args.input, num_trees),
          file=sys.stderr)

    split = {target: [] for target in mapping.values()}
    default = "various"
    split[default] = []

    for tree in orig_treebank:
        found_mapping = False
        for token in " ".join(tree.graph['comment']).strip().split():
            if token in mapping:
                # NOTE(review): the tree is appended once per matching token,
                # so it can end up in several targets (or twice in one).  The
                # original no-op `continue` suggests `break` may have been
                # intended; behaviour is kept as it was.
                split[mapping[token]].append(tree)
                found_mapping = True
        if not found_mapping:
            split[default].append(tree)

    for key in split:
        print(key, len(split[key]), file=sys.stderr)
        # Keep the directory part of the requested output path; building the
        # path from args.output.name alone always wrote into the working
        # directory.
        cio.write_conll(split[key],
                        args.output.parent / (args.output.name + "_" + key),
                        "conll2006")
예제 #8
0
def main():
    """Write a random sample of k trees from a dependency treebank.

    The treebank is read from ``input`` in ``--input-format``, optionally
    stripped of its first ``--ignore-first-n`` trees, shuffled (seeded with
    ``--seed`` when given) and the first ``--k`` trees are written to
    ``output`` in CoNLL 2006 format.

    Exits with status 1 when ``--k`` exceeds the number of trees (unless
    ``--ignore-warning`` is set) or when ``--ignore-first-n`` is too large.
    """
    parser = argparse.ArgumentParser(description="""Sample k trees from a dependency tree file (w/o replacement)""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format', choices=['conll2006', 'conll2006dense', 'conllu'], default="conllu")

    parser.add_argument('--k',default=None,help="randomly sample k instances from file", type=int, required=True)
    parser.add_argument('--ignore-first-n',default=0,help="ignore first n sentences in the file", type=int, required=False)
    parser.add_argument('--seed',default=None,help="seed to use")
    parser.add_argument('--ignore-warning', help="if k > size, ignore warning and select all", default=False, action="store_true")

    args = parser.parse_args()

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    if args.seed:
        random.seed(args.seed)
    print("Loaded treebank {} with {} sentences".format(args.input,num_trees), file=sys.stderr)

    if args.k > num_trees:
        if args.ignore_warning:
            print("ignore-warning={}".format(args.ignore_warning),file=sys.stderr)
        else:
            # Abort with a non-zero status so callers can detect the failure.
            print("k cannot be larger than {} trees. abort. ".format(num_trees), file=sys.stderr)
            sys.exit(1)

    # NOTE(review): for any non-negative k this max() is simply num_trees, so
    # the check does not guarantee that k trees remain after skipping.
    max_ignorable = max(num_trees-args.k,num_trees)
    if args.ignore_first_n >= max_ignorable:
        print("--ignore-first-n cannot be larger than {} trees. abort. ".format(max_ignorable), file=sys.stderr)
        sys.exit(1)

    if args.ignore_first_n:
        print("ignoring first {} trees in file".format(args.ignore_first_n), file=sys.stderr)
        # Skip exactly n trees; the former `[n+1:]` slice dropped one extra.
        orig_treebank = orig_treebank[args.ignore_first_n:]

    random.shuffle(orig_treebank)
    sample = orig_treebank[0:args.k]
    print("sampled {} trees. seed: {}".format(len(sample), args.seed))
    cio.write_conll(sample, args.output, "conll2006")
예제 #9
0
def main():
    """Split a treebank into several files based on tokens in sentence comments.

    ``--mapping`` names a text file whose non-blank lines each hold two
    whitespace-separated fields: a comment token and a target name.  Every
    tree whose comment (``tree.graph['comment']``) contains a mapped token is
    added to the corresponding target; trees without any mapped token go to
    the ``"various"`` target.  Each target is written in CoNLL 2006 format to
    ``<output>_<target>``, next to the requested output path.

    Raises:
        ValueError: if a non-blank mapping line does not have exactly two fields.
    """
    parser = argparse.ArgumentParser(description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format', choices=['conll2006', 'conll2006dense', 'conllu'], default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)

    args = parser.parse_args()

    mapping={}
    # The context manager closes the mapping file; blank lines are skipped
    # instead of breaking the two-field unpacking.
    with open(args.mapping) as mapping_file:
        for line in mapping_file:
            line = line.strip()
            if not line:
                continue
            commentpart, target = line.split()
            mapping[commentpart] = target

    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    print("Loaded treebank {} with {} sentences".format(args.input,num_trees), file=sys.stderr)

    split = {target : [] for target in mapping.values()}
    default = "various"
    split[default] = []

    for tree in orig_treebank:
        found_mapping=False
        for token in " ".join(tree.graph['comment']).strip().split():
            if token in mapping:
                # NOTE(review): the tree is appended once per matching token,
                # so it can end up in several targets (or twice in one).  The
                # original no-op `continue` suggests `break` may have been
                # intended; behaviour is kept as it was.
                split[mapping[token]].append(tree)
                found_mapping=True
        if not found_mapping:
            split[default].append(tree)

    for key in split:
        print(key, len(split[key]), file=sys.stderr)
        # Keep the directory part of the requested output path; building the
        # path from args.output.name alone always wrote into the working
        # directory.
        cio.write_conll(split[key], args.output.parent / (args.output.name + "_" + key), "conll2006")
def main():
    """Re-attach the edges of a CoNLL-U treebank with one of three strategies.

    ``--parsing_strategy`` selects the procedure:

    * ``pagerank`` (default): each sentence is copied, stripped of its edges
      and of node 0, enriched by the helpers named in ``--steps`` and decoded
      with ``tree_decoding_algorithm_content_and_function``.  The result is
      written to two files, ``<lang>_<output>_<steps><.rev|.norev>.conllu``
      and ``<lang>_<output>`` (the ``<lang>_`` prefix is omitted when
      ``--lang`` is not given).
    * ``adjacent``: edges are removed and ``attach_adjacent`` is applied;
      output goes to ``<output>.<rule_backoff>``.
    * ``rules``: ``add_high_confidence_edges`` is applied, the scores in
      ``scorerdict`` are printed, and output goes to ``<output>.rules``.

    Exits with status 1 under Python 2.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument("--input", help="conllu file", default="../data/en-ud-dev.conllu")
    parser.add_argument("--lang")

    parser.add_argument("--posrules", help="head POS rules file", default="../data/posrules.tsv")
    parser.add_argument("--output", help="target file", default="testout.conllu")
    parser.add_argument("--parsing_strategy", choices=["rules", "pagerank", "adjacent"], default="pagerank")
    parser.add_argument(
        "--steps",
        choices=["twotags", "complete", "neighbors", "verbs", "function", "content", "headrule"],
        nargs="+",
        default=[""],
    )
    parser.add_argument("--reverse", action="store_true", default=True)
    # --reverse defaults to True and could therefore never be turned off;
    # this companion flag makes the non-reversed mode reachable.
    parser.add_argument("--no-reverse", dest="reverse", action="store_false", help="disable --reverse")
    parser.add_argument("--rule_backoff", choices=["cycle", "left", "right"], default="left")
    parser.add_argument("--ablation", choices=["pagerank", "2stepdecoding"], default="pagerank")

    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x.")  # suggestion: install anaconda python
        sys.exit(1)

    # `sep` must be passed by keyword: recent pandas releases accept only the
    # file argument of read_csv positionally.
    headrules = pd.read_csv(args.posrules, sep="\t")
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    modif_treebank = []
    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    functionlist = [x for x, y in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)
    if args.parsing_strategy == "pagerank":
        for o in orig_treebank:
            s = copy.copy(o)
            s.remove_edges_from(s.edges())
            # From here and until tree reconstruction there is no symbolic
            # root node, makes our life a bit easier
            s.remove_node(0)

            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)

        r = ".rev" if args.reverse else ".norev"
        # --lang is optional; without it the file names carry no prefix
        # instead of failing on None + str.
        lang_prefix = args.lang + "_" if args.lang is not None else ""
        # Written once after the loop: rewriting both files for every
        # sentence produced the same final content at quadratic cost.
        outfile = Path(lang_prefix + args.output + "_" + "_".join(args.steps) + r + ".conllu")
        cio.write_conll(
            modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
        )
        outfile = Path(lang_prefix + args.output)
        cio.write_conll(
            modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
        )
    elif args.parsing_strategy == "adjacent":
        for s in orig_treebank:
            s.remove_edges_from(s.edges())
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)

    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter, args.rule_backoff)
            modif_treebank.append(s)

        # scorerdict is not defined in this function; it is read from the
        # enclosing module scope.
        for k in sorted(scorerdict.keys()):
            prec = sum([p for p, r in scorerdict[k]]) / len(scorerdict[k])
            reca = sum([r for p, r in scorerdict[k]]) / len(scorerdict[k])
            print("{0}, {1:.2f}, {2:.2f}".format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)
from lib.conll import CoNLLReader
from collections import Counter


def getPosDist(sents):
    """Return the relative frequency of each ``cpostag`` over all sentences.

    For every sentence the first entry of ``s.nodes()`` is skipped and the
    ``"cpostag"`` value of each remaining node is counted.  The counts are then
    divided by the overall number of counted nodes, so the values of the
    returned ``Counter`` sum to 1 (an empty input gives an empty ``Counter``).
    """
    tag_counts = Counter(
        sentence.node[node_id]["cpostag"]
        for sentence in sents
        for node_id in sentence.nodes()[1:]
    )

    n_tokens = sum(tag_counts.values())
    for tag in tag_counts:
        tag_counts[tag] = tag_counts[tag] / n_tokens
    return tag_counts


# Average the per-language POS distributions of the selected training files.
rdr = CoNLLReader()
Acc = Counter()
# Only the first three language codes of the list are processed.
langs = "ar bg cs da de en es eu fa fi fr he hi hr id it nl pl pt sl sv".split()[:3]
for lang in langs:
    filepattern = "/Users/hector/data/parse-holy-data/parse/goldpos/" + lang + "-ud-train.conllu.delex"
    current_sentences = rdr.read_conll_u(filepattern)
    posdist = getPosDist(current_sentences)
    # Accumulate this language's distribution into the running total.
    Acc += posdist

# Turn the summed distributions into a mean over the processed languages.
total = len(langs)
for k in Acc:
    Acc[k] = Acc[k] / total
print(Acc)
def main():
    """Re-attach the edges of a CoNLL-U treebank with one of three strategies.

    ``--parsing_strategy`` selects the procedure:

    * ``pagerank`` (default): each sentence is copied, stripped of its edges
      and of node 0, enriched by the helpers named in ``--steps`` and decoded
      with ``tree_decoding_algorithm_content_and_function``.  The result is
      written to two files, ``<lang>_<output>_<steps><.rev|.norev>.conllu``
      and ``<lang>_<output>`` (the ``<lang>_`` prefix is omitted when
      ``--lang`` is not given).
    * ``adjacent``: edges are removed and ``attach_adjacent`` is applied;
      output goes to ``<output>.<rule_backoff>``.
    * ``rules``: ``add_high_confidence_edges`` is applied, the scores in
      ``scorerdict`` are printed, and output goes to ``<output>.rules``.

    Exits with status 1 under Python 2.
    """
    parser = argparse.ArgumentParser(
        description="""Convert conllu to conll format""")
    parser.add_argument('--input',
                        help="conllu file",
                        default='../data/en-ud-dev.conllu')
    parser.add_argument('--lang')

    parser.add_argument('--posrules',
                        help="head POS rules file",
                        default='../data/posrules.tsv')
    parser.add_argument('--output',
                        help="target file",
                        default="testout.conllu")
    parser.add_argument('--parsing_strategy',
                        choices=['rules', 'pagerank', 'adjacent'],
                        default='pagerank')
    parser.add_argument('--steps',
                        choices=[
                            'twotags', 'complete', 'neighbors', 'verbs',
                            'function', 'content', 'headrule'
                        ],
                        nargs='+',
                        default=[""])
    parser.add_argument('--reverse', action='store_true', default=True)
    # --reverse defaults to True and could therefore never be turned off;
    # this companion flag makes the non-reversed mode reachable.
    parser.add_argument('--no-reverse',
                        dest='reverse',
                        action='store_false',
                        help="disable --reverse")
    parser.add_argument('--rule_backoff',
                        choices=['cycle', 'left', 'right'],
                        default="left")
    parser.add_argument('--ablation',
                        choices=['pagerank', '2stepdecoding'],
                        default="pagerank")

    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x."
              )  #suggestion: install anaconda python
        sys.exit(1)

    # `sep` must be passed by keyword: recent pandas releases accept only the
    # file argument of read_csv positionally.
    headrules = pd.read_csv(args.posrules, sep='\t')
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    modif_treebank = []
    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    functionlist = [x for x, y in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)
    if args.parsing_strategy == 'pagerank':
        for o in orig_treebank:
            s = copy.copy(o)
            s.remove_edges_from(s.edges())
            # From here and until tree reconstruction there is no symbolic
            # root node, makes our life a bit easier
            s.remove_node(0)

            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(
                s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)

        r = ".rev" if args.reverse else ".norev"
        # --lang is optional; without it the file names carry no prefix
        # instead of failing on None + str.
        lang_prefix = args.lang + "_" if args.lang is not None else ""
        # Written once after the loop: rewriting both files for every
        # sentence produced the same final content at quadratic cost.
        outfile = Path(lang_prefix + args.output + "_" +
                       "_".join(args.steps) + r + ".conllu")
        cio.write_conll(modif_treebank,
                        outfile,
                        conllformat='conllu',
                        print_fused_forms=False,
                        print_comments=False)
        outfile = Path(lang_prefix + args.output)
        cio.write_conll(modif_treebank,
                        outfile,
                        conllformat='conllu',
                        print_fused_forms=False,
                        print_comments=False)
    elif args.parsing_strategy == 'adjacent':
        for s in orig_treebank:
            s.remove_edges_from(s.edges())
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank,
                        outfile,
                        conllformat='conllu',
                        print_fused_forms=False,
                        print_comments=False)

    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter,
                                          args.rule_backoff)
            modif_treebank.append(s)

        # scorerdict is not defined in this function; it is read from the
        # enclosing module scope.
        for k in sorted(scorerdict.keys()):
            prec = sum([p for p, r in scorerdict[k]]) / len(scorerdict[k])
            reca = sum([r for p, r in scorerdict[k]]) / len(scorerdict[k])
            print('{0}, {1:.2f}, {2:.2f}'.format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank,
                        outfile,
                        conllformat='conllu',
                        print_fused_forms=False,
                        print_comments=False)
def main():
    """Compare a predicted treebank with its gold standard and print one line.

    ``--predicted`` is read with ``read_conll_u_8cols`` and ``--gold`` with
    ``read_conll_u``.  The printed line starts with ``--lang`` followed by the
    collected figures, each formatted with two decimals and separated by
    single spaces.  Several figures are normalised by the number of nodes in
    the predicted sentences (node 0 of each sentence excluded).

    Both files are required and must be non-empty; otherwise the script exits
    through ``parser.error`` instead of failing with ``ZeroDivisionError``.
    """
    parser = argparse.ArgumentParser(
        description="""Convert conllu to conll format""")
    parser.add_argument('--predicted', help="conllu file")
    parser.add_argument('--gold', help="conllu file")
    parser.add_argument('--lang', help="")

    args = parser.parse_args()

    # Every ratio below divides by a sentence or word count, so both inputs
    # are mandatory in practice.
    if not args.predicted or not args.gold:
        parser.error("both --predicted and --gold must be given")

    rdr = CoNLLReader()
    predicted_sentences = rdr.read_conll_u_8cols(args.predicted)
    gold_sentences = rdr.read_conll_u(args.gold)

    numwords = sum(len(s.nodes()[1:]) for s in predicted_sentences)
    if not predicted_sentences or not gold_sentences or not numwords:
        parser.error("--predicted and --gold must both contain sentences")

    proj_pred = sum(int(s.is_fully_projective()) for s in predicted_sentences)
    proj_gold = sum(int(s.is_fully_projective()) for s in gold_sentences)
    punct_non__proj_pred = sum(
        int(s.punct_proj_violations()) for s in predicted_sentences)
    punct_non__proj_gold = sum(
        int(s.punct_proj_violations()) for s in gold_sentences)

    leaf_violations_pred = sum(
        s.leaf_violations()[0] for s in predicted_sentences)
    leaf_violations_gold = sum(
        s.leaf_violations()[0] for s in gold_sentences)
    wrongPOSgoodHeadscore = wrongPOSgoodHead(predicted_sentences,
                                             gold_sentences)
    posAcc_accum = sum(
        POSAcc(p, g)
        for p, g in zip(predicted_sentences, gold_sentences)) / numwords
    UAS_accum = sum(
        UAS(p, g)
        for p, g in zip(predicted_sentences, gold_sentences)) / numwords
    prelength = edgelengths(predicted_sentences)
    goldlength = edgelengths(gold_sentences)
    # Despite the names, these hold the standard deviation of the edge lengths.
    avgprelength = np.std(prelength)
    avggoldlength = np.std(goldlength)

    vals = [
        wrongPOSgoodHeadscore,
        avgprelength,
        avggoldlength,
        proj_pred / len(predicted_sentences),
        # NOTE(review): the projective share of the prediction is emitted
        # twice; this looks accidental but is kept so that the number of
        # output columns does not change.
        proj_pred / len(predicted_sentences),
        proj_gold / len(gold_sentences),
        punct_non__proj_pred / numwords,
        punct_non__proj_gold / numwords,
        leaf_violations_pred / numwords,
        leaf_violations_gold / numwords,
        KLdivFromMACRO_POS_from_Training(predicted_sentences),
        KLdivFromMACRO_POS_from_Training(gold_sentences),
        posAcc_accum,
        UAS_accum,
    ]
    # Without --lang the label would be None and str.join would raise; "_" is
    # used as a placeholder, mirroring the fallback hinted at in the old
    # commented-out error branch.
    lang_label = args.lang if args.lang is not None else "_"
    lineout = " ".join([lang_label] + ["{0:.2f}".format(x) for x in vals])
    print(lineout)
예제 #14
0
# --input is a path prefix: the split names below are appended to it directly.
parser.add_argument('--input', help="conllu file", default='../data/UD_Spanish-AnCora_frozen/es_ancora-ud-')

args = parser.parse_args()


# Module-level accumulators; they are initialised here and not used within
# this part of the script.
underscore_counter = Counter()
treebank = []


wordcounter=defaultdict(dict)



# Read the dev, test and train files (in that order) and concatenate their
# sentences into one list.
for ext in "dev.conllu test.conllu train.conllu".split():
    infile = args.input+ext
    cio = CoNLLReader()
    treebank = treebank + cio.read_conll_u(infile)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT)

for s in treebank:
    for n in s.nodes()[1:]:
        lemma = s.node[n]['lemma']
        form = s.node[n]['form']
        cpostag = s.node[n]['cpostag']
        feats = s.node[n]['feats']

        if len(lemma) > 2 and "_" in lemma:
            if cpostag == "DET":
                action = 'shared'
            else:
                action = 'leftmost'