예제 #1
0
def main():
    """Split a treebank into several files based on sentence comments.

    Command-line arguments:
        input: path of the treebank to read.
        output: target path; only its final path component is used, as the
            prefix of the files that get written.
        --input-format: ``conll2006``, ``conll2006dense`` or ``conllu``
            (the default); selects which reader method is called.
        --mapping: required file holding one whitespace-separated
            ``<comment token> <target>`` pair per line.

    Each tree is appended to the bucket of every mapped token found in its
    comments; a tree without any mapped token goes to the ``various``
    bucket. One file named ``<output name>_<bucket>`` is written per bucket
    by calling ``write_conll`` with ``"conll2006"``.

    Raises:
        ValueError: if a non-blank mapping line does not consist of exactly
            two whitespace-separated fields.
    """
    parser = argparse.ArgumentParser(
        description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format',
                        choices=['conll2006', 'conll2006dense', 'conllu'],
                        default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)

    args = parser.parse_args()

    mapping = {}
    # Use a context manager so the mapping file is closed deterministically.
    with open(args.mapping) as mapping_file:
        for line in mapping_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no mapping and
                # would otherwise break the two-value unpacking below.
                continue
            commentpart, target = line.split()
            mapping[commentpart] = target

    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    print("Loaded treebank {} with {} sentences".format(args.input, num_trees),
          file=sys.stderr)

    # One bucket per distinct target, plus a fallback for unmatched trees.
    split = {target: [] for target in mapping.values()}
    default = "various"
    split[default] = []

    for tree in orig_treebank:
        found_mapping = False
        for token in " ".join(tree.graph['comment']).split():
            if token in mapping:
                # The tree is appended once per mapped token occurrence, so
                # it can end up in several buckets (or repeatedly in one).
                split[mapping[token]].append(tree)
                found_mapping = True
        if not found_mapping:
            split[default].append(tree)

    for key, trees in split.items():
        print(key, len(trees), file=sys.stderr)
        # NOTE(review): ``args.output.name`` drops any directory part of the
        # output path, so files land in the current working directory.
        cio.write_conll(trees, Path(args.output.name + "_" + key),
                        "conll2006")
예제 #2
0
def main():
    """Split a treebank into several files based on sentence comments.

    Command-line arguments:
        input: path of the treebank to read.
        output: target path; only its final path component is used, as the
            prefix of the files that get written.
        --input-format: ``conll2006``, ``conll2006dense`` or ``conllu``
            (the default); selects which reader method is called.
        --mapping: required file holding one whitespace-separated
            ``<comment token> <target>`` pair per line.

    Each tree is appended to the bucket of every mapped token found in its
    comments; a tree without any mapped token goes to the ``various``
    bucket. One file named ``<output name>_<bucket>`` is written per bucket
    by calling ``write_conll`` with ``"conll2006"``.

    Raises:
        ValueError: if a non-blank mapping line does not consist of exactly
            two whitespace-separated fields.
    """
    parser = argparse.ArgumentParser(description="""Extract data based on comments info""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format', choices=['conll2006', 'conll2006dense', 'conllu'], default="conllu")
    parser.add_argument('--mapping', help="mapping file", required=True)

    args = parser.parse_args()

    mapping = {}
    # Use a context manager so the mapping file is closed deterministically.
    with open(args.mapping) as mapping_file:
        for raw_line in mapping_file:
            fields = raw_line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no mapping and
                # would otherwise break the two-value unpacking below.
                continue
            commentpart, target = fields
            mapping[commentpart] = target

    print("loaded mapping:", mapping, file=sys.stderr)

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    print("Loaded treebank {} with {} sentences".format(args.input, num_trees), file=sys.stderr)

    # One bucket per distinct target, plus a fallback for unmatched trees.
    split = {target: [] for target in mapping.values()}
    default = "various"
    split[default] = []

    for tree in orig_treebank:
        found_mapping = False
        for token in " ".join(tree.graph['comment']).split():
            if token in mapping:
                # The tree is appended once per mapped token occurrence, so
                # it can end up in several buckets (or repeatedly in one).
                split[mapping[token]].append(tree)
                found_mapping = True
        if not found_mapping:
            split[default].append(tree)

    for key, trees in split.items():
        print(key, len(trees), file=sys.stderr)
        # NOTE(review): ``args.output.name`` drops any directory part of the
        # output path, so files land in the current working directory.
        cio.write_conll(trees, Path(args.output.name + "_" + key), "conll2006")
예제 #3
0
def main():
    """Randomly sample k trees from a dependency treebank (no replacement).

    Command-line arguments:
        input: path of the treebank to read.
        output: path the sample is written to via ``write_conll`` with
            ``"conll2006"``.
        --input-format: ``conll2006``, ``conll2006dense`` or ``conllu``
            (the default); selects which reader method is called.
        --k: required number of trees to sample.
        --ignore-first-n: number of leading trees to drop before sampling
            (default 0).
        --seed: optional seed passed to ``random.seed``.
        --ignore-warning: when k exceeds the treebank size, continue and
            select everything instead of aborting.

    Progress messages go to stderr; the final summary line goes to stdout.
    Invalid argument combinations abort with exit status 1.
    """
    parser = argparse.ArgumentParser(description="""Sample k trees from a dependency tree file (w/o replacement)""")
    parser.add_argument('input', help="conllu file")
    parser.add_argument('output', help="target file", type=Path)
    parser.add_argument('--input-format', choices=['conll2006', 'conll2006dense', 'conllu'], default="conllu")

    parser.add_argument('--k', default=None, help="randomly sample k instances from file", type=int, required=True)
    parser.add_argument('--ignore-first-n', default=0, help="ignore first n sentences in the file", type=int, required=False)
    parser.add_argument('--seed', default=None, help="seed to use")
    parser.add_argument('--ignore-warning', help="if k > size, ignore warning and select all", default=False, action="store_true")

    args = parser.parse_args()

    cio = CoNLLReader()
    if args.input_format == "conllu":
        orig_treebank = cio.read_conll_u(args.input)
    elif args.input_format == "conll2006":
        orig_treebank = cio.read_conll_2006(args.input)
    elif args.input_format == "conll2006dense":
        orig_treebank = cio.read_conll_2006_dense(args.input)
    num_trees = len(orig_treebank)

    if args.seed is not None:
        random.seed(args.seed)
    print("Loaded treebank {} with {} sentences".format(args.input, num_trees), file=sys.stderr)

    if args.k > num_trees:
        if args.ignore_warning:
            print("ignore-warning={}".format(args.ignore_warning), file=sys.stderr)
        else:
            # Report on stderr and exit non-zero so callers can detect the abort.
            print("k cannot be larger than {} trees. abort. ".format(num_trees), file=sys.stderr)
            sys.exit(1)

    # NOTE(review): for any non-negative k this bound equals num_trees, so the
    # check only guarantees that at least one tree remains after skipping,
    # not that k trees remain. Confirm whether a tighter bound was intended.
    skip_limit = max(num_trees - args.k, num_trees)
    if args.ignore_first_n >= skip_limit:
        print("--ignore-first-n cannot be larger than {} trees. abort. ".format(skip_limit), file=sys.stderr)
        sys.exit(1)

    if args.ignore_first_n:
        print("ignoring first {} trees in file".format(args.ignore_first_n), file=sys.stderr)
        # Drop exactly the first n trees (slicing from n + 1 would drop one
        # tree too many).
        orig_treebank = orig_treebank[args.ignore_first_n:]

    random.shuffle(orig_treebank)
    # Slicing past the end simply yields every remaining tree.
    sample = orig_treebank[:args.k]
    print("sampled {} trees. seed: {}".format(len(sample), args.seed))
    cio.write_conll(sample, args.output, "conll2006")
def main():
    """Print one tab-separated line of projectivity statistics for a treebank.

    The file given by ``--infile`` is read with ``read_conll_2006_dense``.
    For every sentence the result of ``is_fully_projective()`` and the two
    counters returned by ``non_projectivity_edge_info()`` are accumulated.

    The printed line holds, in order: an empty field, the share of sentences
    for which ``is_fully_projective()`` is true, one share per entry of the
    module-level ``POSLIST`` (taken from the first counter), one share per
    entry of the module-level ``EDGELENGTHS`` (taken from the second
    counter), and the combined share of all second-counter keys that are not
    in ``EDGELENGTHS``. Shares are rounded to two decimals; a share whose
    total is zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    # NOTE(review): the description and help text below do not match what
    # this function does (it reports statistics and reads dense CoNLL 2006);
    # they are left unchanged here.
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument('--infile', help="conllu file", default="/Users/hmartine/proj/eval_multisource/data/2project/watchtower/en.2proj.conll.head1000")

    args = parser.parse_args()

    def share(count, total):
        # Guard against empty input / empty counters.
        return round(count / total, 2) if total else 0.0

    dep_child_counter = Counter()
    gap_degree_counter = Counter()
    proj_counter = Counter()

    rdr = CoNLLReader()
    gold_sentences = []
    if args.infile:
        gold_sentences = rdr.read_conll_2006_dense(args.infile)

    for s in gold_sentences:
        local_isproj = s.is_fully_projective()
        local_dependents, local_gap_degrees = s.non_projectivity_edge_info()
        proj_counter.update([local_isproj])
        dep_child_counter += local_dependents
        gap_degree_counter += local_gap_degrees

    # Totals are loop-invariant, so compute each of them once.
    proj_total = sum(proj_counter.values())
    dep_total = sum(dep_child_counter.values())
    gap_total = sum(gap_degree_counter.values())

    projpercent = share(proj_counter[True], proj_total)
    deppercent = [share(dep_child_counter[posname], dep_total) for posname in POSLIST]
    edgelengths = [share(gap_degree_counter[length], gap_total) for length in EDGELENGTHS]
    # Iterating the counter's own keys means no division happens when it is
    # empty, so this expression cannot divide by zero.
    otherlength = round(sum([gap_degree_counter[length] / gap_total
                             for length in gap_degree_counter.keys()
                             if length not in EDGELENGTHS]), 2)

    print("\t".join([str(x) for x in ["", projpercent] + deppercent + edgelengths + [otherlength]]))