def main():
    """Split a treebank into several files according to sentence comments.

    Command-line arguments:
        input: path of the treebank to read.
        output: target path; only its final component (``.name``) is used as
            the prefix of the files that are written.
        --input-format: ``conll2006``, ``conll2006dense`` or ``conllu``
            (default ``conllu``).
        --mapping: text file with one ``<comment-token> <target>`` pair per
            line, separated by whitespace.

    Every tree whose comment contains a mapped token is appended to the bucket
    of the corresponding target; trees without any mapped token go to the
    ``various`` bucket. Each bucket is written to ``<output name>_<target>``
    in conll2006 format, and its size is reported on stderr.
    """
    parser = argparse.ArgumentParser(
        description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format',
                        choices=['conll2006', 'conll2006dense', 'conllu'],
                        default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)
    args = parser.parse_args()

    # Read the mapping inside a context manager so the handle is always
    # closed, and skip blank lines, which would otherwise break the
    # two-value unpacking below.
    mapping = {}
    with open(args.mapping) as mapping_file:
        for line in mapping_file:
            line = line.strip()
            if not line:
                continue
            commentpart, target = line.split()
            mapping[commentpart] = target
    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)
    print("Loaded treebank {} with {} sentences".format(args.input, num_trees),
          file=sys.stderr)

    # One bucket per mapping target, plus a catch-all for unmatched trees.
    split = {target: [] for target in mapping.values()}
    default = "various"
    split[default] = []

    for tree in orig_treebank:
        found_mapping = False
        for token in " ".join(tree.graph['comment']).strip().split():
            if token in mapping:
                # A tree is appended once per matching token, so it can land
                # in several buckets (or more than once in the same bucket).
                split[mapping[token]].append(tree)
                found_mapping = True
        if not found_mapping:
            split[default].append(tree)

    for key in split:
        print(key, len(split[key]), file=sys.stderr)
        cio.write_conll(split[key], Path(args.output.name + "_" + key),
                        "conll2006")
def main():
    """Partition a treebank by tokens found in the sentence comments.

    Command-line arguments:
        input: treebank file to read.
        output: target path; its ``.name`` is the prefix of every output file.
        --input-format: ``conll2006``, ``conll2006dense`` or ``conllu``
            (default ``conllu``).
        --mapping: file whose lines each hold a comment token and a target
            name, separated by whitespace.

    Trees are grouped by the target of every mapped token that occurs in their
    comment; trees with no mapped token are grouped under ``various``. Each
    group is written as ``<output name>_<target>`` in conll2006 format.
    """
    parser = argparse.ArgumentParser(description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format', choices=['conll2006', 'conll2006dense', 'conllu'], default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)
    args = parser.parse_args()

    # The file is closed as soon as it has been read; empty lines are dropped
    # because they cannot be unpacked into a (token, target) pair.
    with open(args.mapping) as handle:
        lines = [stripped for stripped in (raw.strip() for raw in handle) if stripped]

    mapping = {}
    for line in lines:
        commentpart, target = line.split()
        mapping[commentpart] = target
    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    # argparse restricts --input-format to exactly these three keys.
    readers = {
        "conllu": cio.read_conll_u,
        "conll2006": cio.read_conll_2006,
        "conll2006dense": cio.read_conll_2006_dense,
    }
    orig_treebank = readers[args.input_format](args.input)
    num_trees = len(orig_treebank)
    print("Loaded treebank {} with {} sentences".format(args.input, num_trees), file=sys.stderr)

    default = "various"
    split = {target: [] for target in mapping.values()}
    split[default] = []

    for tree in orig_treebank:
        comment_tokens = " ".join(tree.graph['comment']).strip().split()
        # Every mapped token contributes one append, so a tree may be stored
        # in more than one group.
        targets = [mapping[token] for token in comment_tokens if token in mapping]
        for target in targets:
            split[target].append(tree)
        if not targets:
            split[default].append(tree)

    for key, trees in split.items():
        print(key, len(trees), file=sys.stderr)
        cio.write_conll(trees, Path(args.output.name + "_" + key), "conll2006")
def main():
    """Sample k trees at random (without replacement) from a treebank file.

    Command-line arguments:
        input: treebank file to read.
        output: path of the file the sample is written to (conll2006 format).
        --input-format: ``conll2006``, ``conll2006dense`` or ``conllu``
            (default ``conllu``).
        --k: number of trees to sample (required).
        --ignore-first-n: number of leading trees to drop before sampling
            (default 0).
        --seed: value passed to ``random.seed`` when given.
        --ignore-warning: when k exceeds the treebank size, continue and
            select everything instead of aborting.

    The process exits with status 1 when k is larger than the treebank (and
    ``--ignore-warning`` is not set) or when ``--ignore-first-n`` is too large.
    """
    parser = argparse.ArgumentParser(
        description="""Sample k trees from a dependency tree file (w/o replacement)""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format',
                        choices=['conll2006', 'conll2006dense', 'conllu'],
                        default="conllu")
    parser.add_argument('--k', default=None,
                        help="randomly sample k instances from file",
                        type=int, required=True)
    parser.add_argument('--ignore-first-n', default=0,
                        help="ignore first n sentences in the file",
                        type=int, required=False)
    parser.add_argument('--seed', default=None, help="seed to use")
    parser.add_argument('--ignore-warning',
                        help="if k > size, ignore warning and select all",
                        default=False, action="store_true")
    args = parser.parse_args()

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    if args.seed:
        random.seed(args.seed)
    print("Loaded treebank {} with {} sentences".format(args.input, num_trees),
          file=sys.stderr)

    if args.k > num_trees:
        if args.ignore_warning:
            print("ignore-warning={}".format(args.ignore_warning), file=sys.stderr)
        else:
            # Errors go to stderr with a non-zero status so that callers and
            # shell pipelines can detect the abort.
            print("k cannot be larger than {} trees. abort. ".format(num_trees),
                  file=sys.stderr)
            sys.exit(1)

    # NOTE(review): for any non-negative k this limit is simply num_trees, so
    # the check only guarantees that at least one tree survives the skip, not
    # that k trees remain. Confirm whether num_trees - k was intended.
    skip_limit = max(num_trees - args.k, num_trees)
    if args.ignore_first_n >= skip_limit:
        print("--ignore-first-n cannot be larger than {} trees. abort. ".format(skip_limit),
              file=sys.stderr)
        sys.exit(1)

    if args.ignore_first_n:
        print("ignoring first {} trees in file".format(args.ignore_first_n),
              file=sys.stderr)
        # Slice from n, not n + 1: the latter silently drops one extra tree.
        orig_treebank = orig_treebank[args.ignore_first_n:]

    random.shuffle(orig_treebank)
    sample = orig_treebank[:args.k]
    print("sampled {} trees. seed: {}".format(len(sample), args.seed))
    cio.write_conll(sample, args.output, "conll2006")
def main():
    """Read a CoNLL-U treebank, filter every sentence and write it back out.

    Positional arguments ``input`` and ``output`` name the source file and the
    target file. The optional flags are forwarded to
    ``filter_sentence_content`` (fused-form replacement, deprel suffix
    removal, node properties to blank, Arabic diacritics removal) or to
    ``write_conll`` (output format, printing of fused forms and comments).
    ``--lang`` selects a POS precedence list; unknown codes fall back to the
    ``default`` list.
    """
    parser = argparse.ArgumentParser(description="Convert conllu to conll format")
    parser.add_argument("input", help="conllu file")
    parser.add_argument("output", help="target file", type=Path)
    parser.add_argument(
        "--replace_subtokens_with_fused_forms",
        help="By default removes fused tokens",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--remove_deprel_suffixes",
        help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--remove_node_properties",
        help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats",
        choices=["form", "lemma", "cpostag", "postag", "feats"],
        metavar="prop",
        type=str,
        nargs="+",
    )
    parser.add_argument("--lang", help="specify a language 2-letter code", default="default")
    parser.add_argument(
        "--output_format",
        choices=["conll2006", "conll2009", "conllu"],
        default="conll2006",
    )
    parser.add_argument(
        "--remove_arabic_diacritics",
        help="remove Arabic short vowels",
        default=False,
        action="store_true",
    )
    parser.add_argument("--print_comments", default=False, action="store_true")
    parser.add_argument("--print_fused_forms", default=False, action="store_true")
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        # suggestion: install anaconda python
        print("Sorry, requires Python 3.x.")
        sys.exit(1)

    # POS precedence lists per language. The strings are split on a single
    # space, so a trailing space yields a final empty-string entry.
    pos_precedence = {
        "default": "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT ".split(" "),
        # "de": "PROPN ADP DET ".split(" "),
        "es": "VERB AUX PRON ADP DET".split(" "),
        "fr": "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" "),
        "it": "VERB AUX ADV PRON ADP DET INTJ".split(" "),
    }
    current_pos_precedence_list = pos_precedence.get(args.lang, pos_precedence["default"])

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    # Shallow copy: the sentence objects themselves are shared with
    # orig_treebank and are filtered in place below.
    modif_treebank = copy.copy(orig_treebank)

    # As per Dec 2015 the args.lang variable is redundant once you have
    # current_pos_precedence_list. It is kept for future language-specific
    # modules.
    for sentence in modif_treebank:
        sentence.filter_sentence_content(
            args.replace_subtokens_with_fused_forms,
            args.lang,
            current_pos_precedence_list,
            args.remove_node_properties,
            args.remove_deprel_suffixes,
            args.remove_arabic_diacritics,
        )

    cio.write_conll(
        modif_treebank,
        args.output,
        args.output_format,
        print_fused_forms=args.print_fused_forms,
        print_comments=args.print_comments,
    )
def main():
    """Convert a CoNLL-U treebank after filtering the content of each sentence.

    The command line takes the ``input`` and ``output`` files plus switches
    that are passed on to ``filter_sentence_content`` and ``write_conll``.
    ``--lang`` picks the POS precedence list ("de", "es", "fr", "it"); any
    other value uses the ``default`` list.
    """
    store_true = dict(default=False, action="store_true")

    parser = argparse.ArgumentParser(description="Convert conllu to conll format")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--replace_subtokens_with_fused_forms',
                        help="By default removes fused tokens", **store_true)
    parser.add_argument('--remove_deprel_suffixes',
                        help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod",
                        **store_true)
    parser.add_argument('--remove_node_properties',
                        help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats",
                        choices=['form', 'lemma', 'cpostag', 'postag', 'feats'],
                        metavar='prop', type=str, nargs='+')
    parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
    parser.add_argument('--output_format',
                        choices=['conll2006', 'conll2009', 'conllu'],
                        default="conll2006")
    parser.add_argument('--remove_arabic_diacritics',
                        help="remove Arabic short vowels", **store_true)
    parser.add_argument('--print_comments', **store_true)
    parser.add_argument('--print_fused_forms', **store_true)
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        # suggestion: install anaconda python
        print("Sorry, requires Python 3.x.")
        sys.exit(1)

    # Each entry is split on single spaces, so the strings that end in a
    # space contribute a trailing empty-string element.
    precedence_by_lang = {}
    precedence_by_lang["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CONJ SCONJ X PUNCT ".split(" ")
    precedence_by_lang["de"] = "PROPN ADP DET ".split(" ")
    precedence_by_lang["es"] = "VERB AUX PRON ADP DET".split(" ")
    precedence_by_lang["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ")
    precedence_by_lang["it"] = "VERB AUX ADV PRON ADP DET".split(" ")

    lang_key = args.lang if args.lang in precedence_by_lang else "default"
    current_pos_precedence_list = precedence_by_lang[lang_key]

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    # copy.copy is shallow, so the sentences filtered below are the same
    # objects that orig_treebank holds.
    modif_treebank = copy.copy(orig_treebank)

    # As per Dec 2015 the args.lang variable is redundant once you have
    # current_pos_precedence_list. It is kept for future modifications,
    # i.e. any language-specific modules.
    filter_args = (
        args.replace_subtokens_with_fused_forms,
        args.lang,
        current_pos_precedence_list,
        args.remove_node_properties,
        args.remove_deprel_suffixes,
        args.remove_arabic_diacritics,
    )
    for sent in modif_treebank:
        sent.filter_sentence_content(*filter_args)

    cio.write_conll(modif_treebank, args.output, args.output_format,
                    print_fused_forms=args.print_fused_forms,
                    print_comments=args.print_comments)
def main():
    """Apply ``apply_transform`` to every sentence of a CoNLL-U file.

    ``--input`` and ``--output`` default to a Catalan treebank path and
    ``catout.conllu``; ``--lang`` (default ``"default"``) is handed to
    ``apply_transform``. The result is written in conllu format with fused
    forms and comments enabled.
    """
    parser = argparse.ArgumentParser(description="Convert conllu to conll format")
    # parser.add_argument('--input', help="conllu file", default='../..//UD_Spanish-AnCora/es_ancora-all.conllu')
    parser.add_argument(
        "--input",
        help="conllu file",
        default="../..//UD_Catalan/ca-all.conllu",
    )
    parser.add_argument(
        "--output",
        help="target file",
        type=Path,
        default="catout.conllu",
    )
    parser.add_argument(
        "--lang",
        help="specify a language 2-letter code",
        default="default",
    )
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        # suggestion: install anaconda python
        print("Sorry, requires Python 3.x.")
        sys.exit(1)

    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)

    # Transform each sentence and keep a shallow copy of the result.
    modif_treebank = [
        copy.copy(apply_transform(sentence, args.lang))
        for sentence in orig_treebank
    ]

    cio.write_conll(
        modif_treebank,
        args.output,
        conllformat="conllu",
        print_fused_forms=True,
        print_comments=True,
    )
def main():
    """Run ``apply_transform`` over a CoNLL-U treebank and save the result.

    Defaults point at a Spanish AnCora training file, write to
    ``es_train_out.conllu`` and use language code ``es``. Output is always
    conllu, with fused forms and comments printed.
    """
    parser = argparse.ArgumentParser(description="Convert conllu to conll format")
    # parser.add_argument('--input', help="conllu file", default='../..//UD_Spanish-AnCora/es_ancora-all.conllu')
    parser.add_argument('--input', help="conllu file",
                        default='../data/v2/UD_Spanish-Ancora/es_ancora-ud-train.conllu')
    parser.add_argument('--output', help="target file", type=Path,
                        default="es_train_out.conllu")
    parser.add_argument('--lang', help="specify a language 2-letter code",
                        default="es")
    args = parser.parse_args()

    running_python2 = sys.version_info < (3, 0)
    if running_python2:
        # suggestion: install anaconda python
        print("Sorry, requires Python 3.x.")
        sys.exit(1)

    cio = CoNLLReader()
    transformed = (apply_transform(sent, args.lang)
                   for sent in cio.read_conll_u(args.input))
    # A shallow copy of every transformed sentence is what gets written.
    modif_treebank = [copy.copy(sent) for sent in transformed]

    cio.write_conll(modif_treebank, args.output, conllformat='conllu',
                    print_fused_forms=True, print_comments=True)
def main():
    """Re-attach the edges of a CoNLL-U treebank with one of three strategies.

    Command-line arguments:
        --input: CoNLL-U file to read.
        --lang: optional language code, used as a prefix of the output file
            names in the ``pagerank`` strategy.
        --posrules: tab-separated head POS rules file, read with pandas.
        --output: base name of the output file(s).
        --parsing_strategy: ``rules``, ``pagerank`` (default) or ``adjacent``.
        --steps: edge-building steps applied, in a fixed order, by the
            ``pagerank`` strategy.
        --reverse / --no_reverse: value forwarded to the tree decoding
            routine (default: reverse enabled); also selects the ``.rev`` or
            ``.norev`` suffix of the output name.
        --rule_backoff: ``cycle``, ``left`` (default) or ``right``.
        --ablation: ``pagerank`` (default) or ``2stepdecoding``.

    All strategies write their result in conllu format without fused forms
    and without comments.
    """
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument("--input", help="conllu file", default="../data/en-ud-dev.conllu")
    parser.add_argument("--lang")
    parser.add_argument("--posrules", help="head POS rules file", default="../data/posrules.tsv")
    parser.add_argument("--output", help="target file", default="testout.conllu")
    parser.add_argument("--parsing_strategy", choices=["rules", "pagerank", "adjacent"], default="pagerank")
    parser.add_argument(
        "--steps",
        choices=["twotags", "complete", "neighbors", "verbs", "function", "content", "headrule"],
        nargs="+",
        default=[""],
    )
    parser.add_argument("--reverse", action="store_true", default=True)
    # --reverse defaults to True, so without an explicit off switch the
    # ".norev" path below could never be reached.
    parser.add_argument("--no_reverse", dest="reverse", action="store_false",
                        help="turn off the default --reverse behaviour")
    parser.add_argument("--rule_backoff", choices=["cycle", "left", "right"], default="left")
    parser.add_argument("--ablation", choices=["pagerank", "2stepdecoding"], default="pagerank")
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        # suggestion: install anaconda python
        print("Sorry, requires Python 3.x.")
        sys.exit(1)

    # `sep` must be passed by keyword: recent pandas versions reject extra
    # positional arguments to read_csv.
    headrules = pd.read_csv(args.posrules, sep="\t")
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    modif_treebank = []

    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    # The 100 most frequent entries of wordcounter.
    functionlist = [word for word, _ in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)

    if args.parsing_strategy == "pagerank":
        for original in orig_treebank:
            s = copy.copy(original)
            # Materialise the edges first; removing while iterating a live
            # edge view would modify the container being iterated.
            s.remove_edges_from(list(s.edges()))
            # From here and until tree reconstruction there is no symbolic
            # root node, makes our life a bit easier.
            s.remove_node(0)
            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)

        r = ".rev" if args.reverse else ".norev"
        # --lang is optional; omit the prefix instead of failing on
        # None + str when it was not supplied.
        lang_prefix = "" if args.lang is None else args.lang + "_"

        outfile = Path(lang_prefix + args.output + "_" + "_".join(args.steps) + r + ".conllu")
        cio.write_conll(
            modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
        )
        outfile = Path(lang_prefix + args.output)
        cio.write_conll(
            modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
        )
    elif args.parsing_strategy == "adjacent":
        for s in orig_treebank:
            s.remove_edges_from(list(s.edges()))
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)
    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter, args.rule_backoff)
            modif_treebank.append(s)
        # NOTE(review): scorerdict is not defined in this function; it is
        # presumably a module-level mapping of key -> (precision, recall)
        # pairs. Confirm it is populated before this point.
        for k in sorted(scorerdict.keys()):
            prec = sum(p for p, _ in scorerdict[k]) / len(scorerdict[k])
            reca = sum(rec for _, rec in scorerdict[k]) / len(scorerdict[k])
            print("{0}, {1:.2f}, {2:.2f}".format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)
from lib.conll import CoNLLReader
from collections import Counter


def getPosDist(sents):
    """Return the relative frequency of each ``cpostag`` value in *sents*.

    For every sentence all nodes except the first one returned by ``nodes()``
    are counted (presumably node 0 is an artificial root; confirm against
    the reader). The counts are then divided by the total number of counted
    nodes, and the resulting ``Counter`` is returned.
    """
    tag_counts = Counter(
        sent.node[idx]["cpostag"]
        for sent in sents
        for idx in sent.nodes()[1:]
    )
    n_tokens = sum(tag_counts.values())
    for tag in tag_counts:
        tag_counts[tag] /= n_tokens
    return tag_counts


# Script part: average the per-language POS distributions of the first three
# languages in the list and print the result.
rdr = CoNLLReader()
Acc = Counter()
langs = "ar bg cs da de en es eu fa fi fr he hi hr id it nl pl pt sl sv".split()[:3]

for lang in langs:
    filepattern = "/Users/hector/data/parse-holy-data/parse/goldpos/{}-ud-train.conllu.delex".format(lang)
    Acc = Acc + getPosDist(rdr.read_conll_u(filepattern))

total = len(langs)
for k in Acc:
    Acc[k] /= total

print(Acc)
def main():
    """Rebuild the dependency edges of a CoNLL-U treebank.

    Command-line arguments:
        --input: CoNLL-U file to read.
        --lang: optional language code; prefixes the output file names of the
            ``pagerank`` strategy.
        --posrules: tab-separated head POS rules file (loaded with pandas).
        --output: base name for the output file(s).
        --parsing_strategy: ``rules``, ``pagerank`` (default) or ``adjacent``.
        --steps: which edge-building steps the ``pagerank`` strategy runs;
            they are always applied in the order coded below.
        --reverse / --no_reverse: flag passed to the tree decoding routine
            (enabled by default) and reflected in the output name as ``.rev``
            or ``.norev``.
        --rule_backoff: ``cycle``, ``left`` (default) or ``right``.
        --ablation: ``pagerank`` (default) or ``2stepdecoding``.

    Results are written in conllu format, without fused forms or comments.
    """
    parser = argparse.ArgumentParser(
        description="""Convert conllu to conll format""")
    parser.add_argument('--input', help="conllu file",
                        default='../data/en-ud-dev.conllu')
    parser.add_argument('--lang')
    parser.add_argument('--posrules', help="head POS rules file",
                        default='../data/posrules.tsv')
    parser.add_argument('--output', help="target file",
                        default="testout.conllu")
    parser.add_argument('--parsing_strategy',
                        choices=['rules', 'pagerank', 'adjacent'],
                        default='pagerank')
    parser.add_argument('--steps',
                        choices=['twotags', 'complete', 'neighbors', 'verbs',
                                 'function', 'content', 'headrule'],
                        nargs='+', default=[""])
    parser.add_argument('--reverse', action='store_true', default=True)
    # With default=True and store_true alone, reverse could never be False;
    # this switch makes the ".norev" branch reachable.
    parser.add_argument('--no_reverse', dest='reverse', action='store_false',
                        help="turn off the default --reverse behaviour")
    parser.add_argument('--rule_backoff', choices=['cycle', 'left', 'right'],
                        default="left")
    parser.add_argument('--ablation', choices=['pagerank', '2stepdecoding'],
                        default="pagerank")
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        # suggestion: install anaconda python
        print("Sorry, requires Python 3.x.")
        sys.exit(1)

    # Keyword `sep`: positional use is rejected by recent pandas releases.
    headrules = pd.read_csv(args.posrules, sep='\t')
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    modif_treebank = []

    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    # Keep only the keys of the 100 most common wordcounter entries.
    functionlist = [word for word, _ in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)

    write_options = dict(conllformat='conllu', print_fused_forms=False,
                         print_comments=False)

    if args.parsing_strategy == 'pagerank':
        for original in orig_treebank:
            s = copy.copy(original)
            # Snapshot the edges so the graph is not mutated while its own
            # edge view is being iterated.
            s.remove_edges_from(list(s.edges()))
            # From here and until tree reconstruction there is no symbolic
            # root node, makes our life a bit easier.
            s.remove_node(0)
            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(
                s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)

        r = ".rev" if args.reverse else ".norev"
        # --lang has no default; skip the prefix rather than raising
        # TypeError on None + str.
        lang_prefix = "" if args.lang is None else args.lang + "_"

        outfile = Path(lang_prefix + args.output + "_" +
                       "_".join(args.steps) + r + ".conllu")
        cio.write_conll(modif_treebank, outfile, **write_options)
        outfile = Path(lang_prefix + args.output)
        cio.write_conll(modif_treebank, outfile, **write_options)
    elif args.parsing_strategy == 'adjacent':
        for s in orig_treebank:
            s.remove_edges_from(list(s.edges()))
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank, outfile, **write_options)
    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter,
                                          args.rule_backoff)
            modif_treebank.append(s)
        # NOTE(review): scorerdict is not defined inside this function;
        # presumably a module-level dict of (precision, recall) pairs.
        # Verify it is filled before this loop runs.
        for k in sorted(scorerdict.keys()):
            prec = sum(p for p, _ in scorerdict[k]) / len(scorerdict[k])
            reca = sum(rec for _, rec in scorerdict[k]) / len(scorerdict[k])
            print('{0}, {1:.2f}, {2:.2f}'.format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank, outfile, **write_options)
def main():
    """Print one line of evaluation figures for a predicted/gold treebank pair.

    ``--predicted`` is read with ``read_conll_u_8cols`` and ``--gold`` with
    ``read_conll_u``; a missing option leaves the corresponding sentence list
    empty. The printed line is ``--lang`` followed by fourteen values, each
    formatted with two decimals and separated by single spaces.
    """
    parser = argparse.ArgumentParser(description="Convert conllu to conll format")
    parser.add_argument('--predicted', help="conllu file")
    parser.add_argument('--gold', help="conllu file")
    parser.add_argument('--lang', help="")
    args = parser.parse_args()

    rdr = CoNLLReader()
    predicted_sentences = rdr.read_conll_u_8cols(args.predicted) if args.predicted else []
    gold_sentences = rdr.read_conll_u(args.gold) if args.gold else []

    # Token count of the predicted side; nodes()[1:] leaves out the first node.
    numwords = sum(len(s.nodes()[1:]) for s in predicted_sentences)

    proj_pred = sum(int(s.is_fully_projective()) for s in predicted_sentences)
    proj_gold = sum(int(s.is_fully_projective()) for s in gold_sentences)
    punct_non__proj_pred = sum(int(s.punct_proj_violations()) for s in predicted_sentences)
    punct_non__proj_gold = sum(int(s.punct_proj_violations()) for s in gold_sentences)
    leaf_violations_pred = sum(s.leaf_violations()[0] for s in predicted_sentences)
    leaf_violations_gold = sum(s.leaf_violations()[0] for s in gold_sentences)

    wrongPOSgoodHeadscore = wrongPOSgoodHead(predicted_sentences, gold_sentences)

    sentence_pairs = list(zip(predicted_sentences, gold_sentences))
    posAcc_accum = sum(POSAcc(p, g) for p, g in sentence_pairs) / numwords
    UAS_accum = sum(UAS(p, g) for p, g in sentence_pairs) / numwords

    # NOTE(review): the names say "avg" but np.std (standard deviation) is
    # what is computed. Confirm which statistic is wanted.
    avgprelength = np.std(edgelengths(predicted_sentences))
    avggoldlength = np.std(edgelengths(gold_sentences))

    n_pred = len(predicted_sentences)
    n_gold = len(gold_sentences)
    vals = [
        wrongPOSgoodHeadscore,
        avgprelength,
        avggoldlength,
        # NOTE(review): the predicted projectivity ratio is emitted twice, as
        # in the original column layout. Verify whether that is intentional.
        proj_pred / n_pred,
        proj_pred / n_pred,
        proj_gold / n_gold,
        punct_non__proj_pred / numwords,
        punct_non__proj_gold / numwords,
        leaf_violations_pred / numwords,
        leaf_violations_gold / numwords,
        KLdivFromMACRO_POS_from_Training(predicted_sentences),
        KLdivFromMACRO_POS_from_Training(gold_sentences),
        posAcc_accum,
        UAS_accum,
    ]

    lineout = " ".join([args.lang] + [format(value, ".2f") for value in vals])
    print(lineout)
args = parser.parse_args() underscore_counter = Counter() treebank = [] wordcounter=defaultdict(dict) for ext in "dev.conllu test.conllu train.conllu".split(): infile = args.input+ext cio = CoNLLReader() treebank = treebank + cio.read_conll_u(infile)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT) for s in treebank: for n in s.nodes()[1:]: lemma = s.node[n]['lemma'] form = s.node[n]['form'] cpostag = s.node[n]['cpostag'] feats = s.node[n]['feats'] if len(lemma) > 2 and "_" in lemma: if cpostag == "DET": action = 'shared' else: action = 'leftmost' underscore_counter[(form,cpostag,lemma,feats,action)]+=1