Example #1
import argparse
import sys
from pathlib import Path

# Project-specific reader; the import path below is an assumption, adjust it to
# wherever your project defines CoNLLReader.
from lib.conll import CoNLLReader


def main():
    parser = argparse.ArgumentParser(
        description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format',
                        choices=['conll2006', 'conll2006dense', 'conllu'],
                        default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)

    args = parser.parse_args()

    lines = [line.strip() for line in open(args.mapping)]
    mapping = {}
    for line in lines:
        commentpart, target = line.split()
        mapping[commentpart] = target

    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    print("Loaded treebank {} with {} sentences".format(args.input, num_trees),
          file=sys.stderr)

    split = {mapping[k]: [] for k in mapping.keys()}
    default = "various"
    split[default] = []

    for tree in orig_treebank:
        found_mapping = False
        for token in " ".join(tree.graph['comment']).strip().split():
            if token in mapping:
                split[mapping[token]].append(tree)
                found_mapping = True
                break  # route each tree to the first matching split only
        if not found_mapping:
            split[default].append(tree)

    for key in split:
        print(key, len(split[key]), file=sys.stderr)
        cio.write_conll(split[key], Path(args.output.name + "_" + key),
                        "conll2006")
Example #3
import argparse
import random
import sys
from pathlib import Path

# Project-specific reader; the import path below is an assumption, adjust it to
# wherever your project defines CoNLLReader.
from lib.conll import CoNLLReader


def main():
    parser = argparse.ArgumentParser(
        description="""Sample k trees from a dependency tree file (w/o replacement)""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format',
                        choices=['conll2006', 'conll2006dense', 'conllu'],
                        default="conllu")

    parser.add_argument('--k', default=None, help="randomly sample k instances from file", type=int, required=True)
    parser.add_argument('--ignore-first-n', default=0, help="ignore first n sentences in the file", type=int, required=False)
    parser.add_argument('--seed', default=None, help="seed to use")
    parser.add_argument('--ignore-warning', help="if k > size, ignore warning and select all", default=False, action="store_true")

    args = parser.parse_args()

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    if args.seed:
        random.seed(args.seed)
    print("Loaded treebank {} with {} sentences".format(args.input, num_trees), file=sys.stderr)

    if args.k > num_trees:
        if args.ignore_warning:
            print("ignore-warning={}".format(args.ignore_warning), file=sys.stderr)
        else:
            print("k cannot be larger than {} trees. abort.".format(num_trees))
            exit()
    if args.ignore_first_n >= max(num_trees - args.k, num_trees):
        print("--ignore-first-n cannot be larger than {} trees. abort.".format(max(num_trees - args.k, num_trees)))
        exit()

    if args.ignore_first_n:
        print("ignoring first {} trees in file".format(args.ignore_first_n), file=sys.stderr)
        orig_treebank = orig_treebank[args.ignore_first_n:]  # skip exactly the first n trees

    random.shuffle(orig_treebank)
    sample = orig_treebank[0:args.k]
    print("sampled {} trees. seed: {}".format(len(sample), args.seed))
    cio.write_conll(sample, args.output, "conll2006")
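
A hypothetical invocation of the sampler above (the script and file names are assumptions; the flags are the ones defined by its argparse setup).

#     python sample_trees.py en-ud-train.conllu en-ud-sample.conllu --k 500 --seed 42 --ignore-first-n 100
#
# Sampling is without replacement: the treebank (minus the ignored prefix) is
# shuffled in place and the first k trees are written out in conll2006 format.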
Example #4
import argparse
import copy
import sys
from collections import defaultdict
from pathlib import Path

# Project-specific reader; the import path below is an assumption, adjust it to
# wherever your project defines CoNLLReader. apply_transform (used by the last
# variant below) is likewise assumed to be defined in the surrounding module.
from lib.conll import CoNLLReader


def main():
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true")
    parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod", default=False, action="store_true")
    parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag', 'postag', 'feats'], metavar='prop', type=str, nargs='+')
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006")
    parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true")
    parser.add_argument('--print_comments', default=False, action="store_true")
    parser.add_argument('--print_fused_forms', default=False, action="store_true")

    args = parser.parse_args()

    if sys.version_info < (3,0):
        print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
        sys.exit(1)
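
    # Presumably (judging from how this list is passed to filter_sentence_content),
    # the ranking below decides which subtoken's properties stand in for a fused
    # form when --replace_subtokens_with_fused_forms is set; earlier tags win.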

    POSRANKPRECEDENCEDICT = defaultdict(list)
    POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT".split(" ")
    # POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET ".split(" ")
    POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ")
    POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ")
    POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET INTJ".split(" ")

    if args.lang in POSRANKPRECEDENCEDICT:
        current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang]
    else:
        current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"]

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT)
    modif_treebank = copy.copy(orig_treebank)

    # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list
    # We keep it for future modifications, i.e. any language-specific modules
    for s in modif_treebank:
        # print('sentence', s.get_sentence_as_string(printid=True))
        s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang,
                                  current_pos_precedence_list, args.remove_node_properties,
                                  args.remove_deprel_suffixes, args.remove_arabic_diacritics)

    cio.write_conll(modif_treebank, args.output, args.output_format,
                    print_fused_forms=args.print_fused_forms, print_comments=args.print_comments)
def main():
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true")
    parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod", default=False, action="store_true")
    parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag', 'postag', 'feats'], metavar='prop', type=str, nargs='+')
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006")
    parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true")
    parser.add_argument('--print_comments', default=False, action="store_true")
    parser.add_argument('--print_fused_forms', default=False, action="store_true")

    args = parser.parse_args()

    if sys.version_info < (3,0):
        print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
        sys.exit(1)

    POSRANKPRECEDENCEDICT = defaultdict(list)
    POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CONJ SCONJ X PUNCT".split(" ")
    POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET".split(" ")
    POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ")
    POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ")
    POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET".split(" ")

    if args.lang in POSRANKPRECEDENCEDICT:
        current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang]
    else:
        current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"]

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT)
    modif_treebank = copy.copy(orig_treebank)

    # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list
    # We keep it for future modifications, i.e. any language-specific modules
    for s in modif_treebank:
        s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang,
                                  current_pos_precedence_list, args.remove_node_properties,
                                  args.remove_deprel_suffixes, args.remove_arabic_diacritics)

    cio.write_conll(modif_treebank, args.output, args.output_format,
                    print_fused_forms=args.print_fused_forms, print_comments=args.print_comments)
def main():
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    #parser.add_argument('--input', help="conllu file", default='../..//UD_Spanish-AnCora/es_ancora-all.conllu')
    parser.add_argument('--input', help="conllu file", default='../..//UD_Catalan/ca-all.conllu')
    parser.add_argument('--output', help="target file", type=Path,default="catout.conllu")
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    args = parser.parse_args()

    if sys.version_info < (3,0):
        print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
        sys.exit(1)

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT)
    modif_treebank = []
    for s in orig_treebank:
        s = copy.copy(apply_transform(s,args.lang))
        #if not 'multi_tokens' in s.graph.keys():
        #    print(s.get_sentence_as_string())
        modif_treebank.append(s)
    cio.write_conll(modif_treebank, args.output, conllformat='conllu',
                    print_fused_forms=True, print_comments=True)
Example #7
import argparse
import copy
import sys
from pathlib import Path

import pandas as pd

# Project-specific reader; the import path below is an assumption, adjust it to
# wherever your project defines CoNLLReader. The helper functions used further
# below (apply_transform, count_pos_bigrams, the add_*_edges steps, scorerdict,
# and so on) are assumed to be defined in the surrounding module.
from lib.conll import CoNLLReader


def main():
    parser = argparse.ArgumentParser(
        description="""Convert conllu to conll format""")
    #parser.add_argument('--input', help="conllu file", default='../..//UD_Spanish-AnCora/es_ancora-all.conllu')
    parser.add_argument(
        '--input',
        help="conllu file",
        default='../data/v2/UD_Spanish-Ancora/es_ancora-ud-train.conllu')
    parser.add_argument('--output',
                        help="target file",
                        type=Path,
                        default="es_train_out.conllu")
    parser.add_argument('--lang',
                        help="specify a language 2-letter code",
                        default="es")
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x.")  # suggestion: install anaconda python
        sys.exit(1)

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)  #, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT)
    modif_treebank = []
    for s in orig_treebank:
        s = copy.copy(apply_transform(s, args.lang))
        #if not 'multi_tokens' in s.graph.keys():
        #    print(s.get_sentence_as_string())
        modif_treebank.append(s)
    cio.write_conll(modif_treebank,
                    args.output,
                    conllformat='conllu',
                    print_fused_forms=True,
                    print_comments=True)
def main():
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument("--input", help="conllu file", default="../data/en-ud-dev.conllu")
    parser.add_argument("--lang")

    parser.add_argument("--posrules", help="head POS rules file", default="../data/posrules.tsv")
    parser.add_argument("--output", help="target file", default="testout.conllu")
    parser.add_argument("--parsing_strategy", choices=["rules", "pagerank", "adjacent"], default="pagerank")
    parser.add_argument(
        "--steps",
        choices=["twotags", "complete", "neighbors", "verbs", "function", "content", "headrule"],
        nargs="+",
        default=[""],
    )
    parser.add_argument("--reverse", action="store_true", default=True)
    parser.add_argument("--rule_backoff", choices=["cycle", "left", "right"], default="left")
    parser.add_argument("--ablation", choices=["pagerank", "2stepdecoding"], default="pagerank")

    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x.")  # suggestion: install anaconda python
        sys.exit(1)

    headrules = pd.read_csv(args.posrules, sep="\t")
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    ref_treebank = cio.read_conll_u(args.input)
    modif_treebank = []
    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    functionlist = [x for x, y in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)
    if args.parsing_strategy == "pagerank":
        for o, ref in zip(orig_treebank, ref_treebank):
            s = copy.copy(o)
            s.remove_edges_from(s.edges())
            # From here and until tree reconstruction there is no symbolic root node, makes our life a bit easier
            s.remove_node(0)
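
            # Presumably each enabled step below adds candidate edges to the now
            # edgeless sentence graph; the decoding call afterwards extracts a
            # dependency tree from those candidates.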

            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)
            if args.reverse:
                r = ".rev"
            else:
                r = ".norev"
            outfile = Path(args.lang + "_" + args.output + "_" + "_".join(args.steps) + r + ".conllu")
            cio.write_conll(
                modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
            )
            outfile = Path(args.lang + "_" + args.output)
            cio.write_conll(
                modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
            )
    elif args.parsing_strategy == "adjacent":
        for s in orig_treebank:
            s.remove_edges_from(s.edges())
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)

    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter, args.rule_backoff)
            modif_treebank.append(s)

        for k in sorted(scorerdict.keys()):
            prec = sum([p for p, r in scorerdict[k]]) / len(scorerdict[k])
            reca = sum([r for p, r in scorerdict[k]]) / len(scorerdict[k])
            print("{0}, {1:.2f}, {2:.2f}".format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)
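
Finally, a minimal, hypothetical stub of the CoNLLReader interface that every example above relies on. Method names and call signatures are taken from the call sites; the real class ships with the respective projects and operates on sentence graphs rather than raw strings, so this is only a sketch of the assumed API.

class CoNLLReader:
    """Illustrative stub only; not the projects' actual implementation."""

    def read_conll_u(self, filename):
        """Parse a CoNLL-U file and return a list of sentence trees."""
        raise NotImplementedError

    def read_conll_2006(self, filename):
        """Parse a CoNLL 2006 (CoNLL-X) file."""
        raise NotImplementedError

    def read_conll_2006_dense(self, filename):
        """Parse the dense CoNLL 2006 variant read by some of the scripts."""
        raise NotImplementedError

    def write_conll(self, treebank, outfile, conllformat,
                    print_fused_forms=False, print_comments=False):
        """Serialize a list of sentence trees to outfile in the given format."""
        raise NotImplementedError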