Exemplo n.º 1
0
def validate(inp,out,args,tag_sets):
    """Run the whole-tree tests on every tree read from inp.

    Trees come from trees(inp, tag_sets, args) as (comments, tree) pairs.
    Each tree goes through the whole-tree tests in a fixed order; when
    args.echo_input is set, the tree and its comments are printed to out.
    Once all trees are processed, validate_newlines(inp) is called.
    """
    for comments,tree in trees(inp,tag_sets,args):
        # Line-level validation already happened inside trees(); the tests
        # below look at the tree as a whole and run in this exact order.
        for whole_tree_test in (
            validate_ID_sequence,
            validate_ID_references,
            validate_token_ranges,
            validate_root,
            validate_deps,
            validate_tree,
        ):
            whole_tree_test(tree)
        if args.echo_input:
            file_util.print_tree(comments,tree,out)
    validate_newlines(inp)
Exemplo n.º 2
0
def validate(inp, out, args, tag_sets):
    """Validate every tree yielded by trees(inp, tag_sets, args).

    For each (comments, tree) pair the whole-tree tests are run in order,
    and the tree is echoed to out if args.echo_input is set. When the
    input is exhausted, validate_newlines(inp) is called.
    """
    def run_whole_tree_tests(current):
        # The individual lines were already validated in trees(); these
        # tests operate on the complete tree.
        validate_ID_sequence(current)
        validate_ID_references(current)
        validate_token_ranges(current)
        validate_root(current)
        validate_deps(current)
        validate_tree(current)

    for comments, tree in trees(inp, tag_sets, args):
        run_whole_tree_tests(tree)
        if not args.echo_input:
            continue
        file_util.print_tree(comments, tree, out)
    validate_newlines(inp)
def validate(inp,out,args,tag_sets,known_sent_ids):
    """Run the whole-tree tests on every tree read from inp.

    Trees come from trees(inp, tag_sets, args) as (comments, tree) pairs.
    The module-level tree_counter is incremented for each tree before any
    test runs. After the whole-tree tests, validate_sent_id() is called
    with the comments, known_sent_ids and args.lang. validate_text_meta()
    runs only when args.check_tree_text is set, and the tree is printed to
    out only when args.echo_input is set. Finally validate_newlines(inp)
    is called.
    """
    global tree_counter
    for comments,tree in trees(inp,tag_sets,args):
        tree_counter+=1
        # Line-level validation already happened inside trees(); the tests
        # below look at the tree as a whole and run in this exact order.
        for whole_tree_test in (
            validate_ID_sequence,
            validate_ID_references,
            validate_token_ranges,
            validate_root,
            validate_deps,
            validate_tree,
        ):
            whole_tree_test(tree)
        validate_sent_id(comments,known_sent_ids,args.lang)
        if args.check_tree_text:
            validate_text_meta(comments,tree)
        if args.echo_input:
            file_util.print_tree(comments,tree,out)
    validate_newlines(inp)
Exemplo n.º 4
0
def validate(inp, out, args, tag_sets, known_sent_ids):
    """Validate every tree yielded by trees(inp, tag_sets, args).

    The module-level tree_counter is bumped once per tree before the
    tests run. Each tree goes through the whole-tree tests, then
    validate_sent_id() is called with the comments, known_sent_ids and
    args.lang. validate_text_meta() is run when args.check_tree_text is
    set, and the tree is echoed to out when args.echo_input is set. When
    the input is exhausted, validate_newlines(inp) is called.
    """
    global tree_counter

    def run_whole_tree_tests(current):
        # The individual lines were already validated in trees(); these
        # tests operate on the complete tree.
        validate_ID_sequence(current)
        validate_ID_references(current)
        validate_token_ranges(current)
        validate_root(current)
        validate_deps(current)
        validate_tree(current)

    for comments, tree in trees(inp, tag_sets, args):
        tree_counter = tree_counter + 1
        run_whole_tree_tests(tree)
        validate_sent_id(comments, known_sent_ids, args.lang)
        if args.check_tree_text:
            validate_text_meta(comments, tree)
        if not args.echo_input:
            continue
        file_util.print_tree(comments, tree, out)
    validate_newlines(inp)
Exemplo n.º 5
0
            line_idx+=1
            for word_idx,_ in enumerate(range(b,e+1)): #consume as many lines as there are words in the token
                word_ids.append("%d.%d"%(token_idx+1,word_idx+1))
                wtree[line_idx][ID]=word_ids[-1]
                line_idx+=1
    #word_ids is now a list with 1-based indexing which has the new ID for every single word
    #the ID column has been renumbered by now
    #now we can renumber all of the HEAD columns
    for cols in wtree:
        if cols[HEAD]==u"_": #token
            continue
        cols[HEAD]=word_ids[int(cols[HEAD])]
        if cols[DEPS]!=u"_": #need to renumber secondary deps
            new_pairs=[]
            for head_deprel in cols[DEPS].split(u"|"):
                head,deprel=head_deprel.split(u":")
                new_pairs.append(word_ids[int(head)]+u":"+deprel)
            cols[DEPS]=u"|".join(new_pairs)

if __name__=="__main__":
    # Script entry point: parse the command line, then read every tree,
    # pass it to w2t() and print it.
    opt_parser = argparse.ArgumentParser(
        description=(
            'Conversion script from word-based CoNLL-U to token-based CoNLL-U. '
            'This script assumes that the input is validated and does no checking on its own.'
        )
    )
    # Both positionals are optional (nargs='?').
    opt_parser.add_argument(
        'input',
        nargs='?',
        help='Input file name, or "-" or nothing for standard input.'
    )
    opt_parser.add_argument(
        'output',
        nargs='?',
        help='Output file name, or "-" or nothing for standard output.'
    )
    args = opt_parser.parse_args()  # parsed command-line arguments

    # file_util.in_out() turns the parsed arguments into the input and
    # output objects used below.
    inp,out=file_util.in_out(args)
    for comments,tree in file_util.trees(inp):
        # The return value of w2t() is not used; the same tree object is
        # printed afterwards (presumably converted in place -- confirm
        # against w2t's definition).
        w2t(tree)
        file_util.print_tree(comments,tree,out)