Exemplo n.º 1
0
    opt_parser.add_argument(
        '--catvals',
        default=None,
        help=
        'Print category=value pairs. The option can be "UD", "langspec", or "UD+langspec". This distinction is based on the feature, not the value.'
    )
    opt_parser.add_argument(
        '--sort',
        default='freq',
        help=
        'Sort the values by their frequency (freq) or alphabetically (alph). Default: %(default)s.'
    )
    args = opt_parser.parse_args()  #Parsed command-line arguments
    args.output = "-"
    inp, out = file_util.in_out(args, multiple_files=True)
    trees = file_util.trees(inp)

    stats = Stats()
    try:
        for comments, tree in trees:
            stats.tree_count += 1
            for cols in tree:
                stats.count_cols(cols)
    except:
        traceback.print_exc()
        print >> sys.stderr, "\n\n ------- STATS MAY BE EMPTY OR INCOMPLETE ----------"
        pass
    if args.stats:
        stats.print_basic_stats(out)
    if args.jsonstats:
        d = stats.get_stats()
Exemplo n.º 2
0
def sent_set(inp):
    """Count how often each sentence text occurs in the input.

    Each tree yielded by ``file_util.trees(inp)`` is turned into one
    sentence string: the FORM column of every line whose ID column consists
    of digits only, joined with single spaces.

    Returns a dict mapping sentence text to its number of occurrences.
    """
    counts = {}
    for _comments, tree in file_util.trees(inp):
        forms = []
        for cols in tree:
            # Lines whose ID is not purely numeric do not contribute a form.
            if cols[ID].isdigit():
                forms.append(cols[FORM])
        sentence = u" ".join(forms)
        if sentence in counts:
            counts[sentence] += 1
        else:
            counts[sentence] = 1
    return counts
        if cols[DEPS]!=u"_": #need to renumber secondary deps
            new_pairs=[]
            for head_deprel in cols[DEPS].split(u"|"):
                head,deprel=head_deprel.split(u":")
                new_pairs.append(word_ids[int(head)]+u":"+deprel)
            cols[DEPS]=u"|".join(new_pairs)

if __name__=="__main__":
    # Command-line entry point: read CoNLL-U trees and print every tree as a
    # list of dependencies, one "deprel(gov-N, dep-M)" line per dependency,
    # followed by an empty line.
    opt_parser = argparse.ArgumentParser(description='Conversion script from word-based CoNLL-U to other formats.')
    opt_parser.add_argument('input', nargs='?', help='Input file name, or "-" or nothing for standard input.')
    opt_parser.add_argument('output', nargs='?', help='Output file name, or "-" or nothing for standard output.')
    opt_parser.add_argument('-f','--output-format', default="dgraph", help='Output format. Currently supported: dgraph (CoreNLP dep output). Default: %(default)s.')
    args = opt_parser.parse_args() #Parsed command-line arguments

    inp,out=file_util.in_out(args)
    for comments,tree in file_util.trees(inp):
        #Keep only the word lines (purely numeric ID). The indices stored in
        #deps are word indices, so they must be looked up in this list and
        #not in tree, which may also hold non-word lines (e.g. token ranges).
        #Assumes word IDs run 1..n in file order -- TODO confirm for the
        #inputs this script is used on.
        words=[line for line in tree if line[ID].isdigit()]
        deps=set() #A set of (gov,dep,dType) where gov and dep are zero-based indices
        for line in words:
            dep_idx=int(line[ID])-1
            if line[HEAD] not in (u"_",u"0"):
                deps.add((int(line[HEAD])-1,dep_idx,line[DEPREL]))
            #Process also the DEPS field
            if line[DEPS]!=u"_":
                for head_col_deprel in line[DEPS].split(u"|"):
                    head,deprel=head_col_deprel.split(u":",1)
                    if head==u"0":
                        #Root attachment: skipped like HEAD==0 above, since
                        #index -1 would silently address the last word.
                        continue
                    #Use the relation parsed from DEPS, not the basic DEPREL
                    deps.add((int(head)-1,dep_idx,deprel))
        #Done. Maybe these should be sorted somehow? Also, what to do if we have no deps?
        for gov,dep,deprel in sorted(deps):
            print >> out, u"%s(%s-%d, %s-%d)"%(deprel,words[gov][FORM],gov+1,words[dep][FORM],dep+1)
        print >> out
        help='Input file name, or "-" or nothing for standard input.')
    opt_parser.add_argument(
        'output',
        nargs='?',
        help='Output file name, or "-" or nothing for standard output.')
    opt_parser.add_argument(
        '-f',
        '--output-format',
        default="dgraph",
        help=
        'Output format. Currently supported: dgraph (CoreNLP dep output). Default: %(default)s.'
    )
    args = opt_parser.parse_args()  #Parsed command-line arguments

    inp, out = file_util.in_out(args)
    for comments, tree in file_util.trees(inp):
        deps = set(
        )  #A set of (gov,dep,dType) where gov and dep are zero-based indices
        for line in tree:
            if not line[ID].isdigit():  #token line, skip
                continue
            if line[HEAD] not in (u"_", u"0"):
                deps.add(
                    (int(line[HEAD]) - 1, int(line[ID]) - 1, line[DEPREL]))
            #Process also the DEPS field
            if line[DEPS] != u"_":
                for head_col_deprel in line[DEPS].split(u"|"):
                    head, deprel = head_col_deprel.split(u":", 1)
                    deps.add((int(head) - 1, int(line[ID]) - 1, line[DEPREL]))
        #Done. Maybe these should be sorted somehow? Also, what to do if we have no deps?
        for gov, dep, deprel in sorted(deps):
Exemplo n.º 5
0
                print >> out, cat_is_val
        
        

if __name__=="__main__":
    # Command-line entry point of the stats script: parse the options, feed
    # every line of every tree into a Stats object, then print the reports
    # that were requested on the command line.
    opt_parser = argparse.ArgumentParser(description='Script for basic stats generation. Assumes a validated input.')
    # NOTE(review): nargs='+' requires at least one input name, although the
    # help text also mentions "nothing" for standard input -- confirm intent.
    opt_parser.add_argument('input', nargs='+', help='Input file name (can be several files), or "-" or nothing for standard input.')
    opt_parser.add_argument('--stats',action='store_true',default=False, help='Print basic stats')
    opt_parser.add_argument('--jsonstats',action='store_true',default=False, help='Print basic stats as json dictionary')
    opt_parser.add_argument('--deprels',default=None,help='Print deprels. The option can be "UD", "langspec", or "UD+langspec".')
    opt_parser.add_argument('--catvals',default=None,help='Print category=value pairs. The option can be "UD", "langspec", or "UD+langspec". This distinction is based on the feature, not the value.')
    opt_parser.add_argument('--sort',default='freq',help='Sort the values by their frequency (freq) or alphabetically (alph). Default: %(default)s.')
    args = opt_parser.parse_args() #Parsed command-line arguments
    # The parser defines no output argument, so one is set by hand before the
    # namespace is passed to file_util.in_out (presumably "-" selects
    # standard output there -- verify against file_util).
    args.output="-"
    inp,out=file_util.in_out(args,multiple_files=True)
    trees=file_util.trees(inp)

    stats=Stats()
    try:
        # Count each tree once and hand each of its lines to the collector.
        for comments,tree in trees:
            stats.tree_count+=1
            for cols in tree:
                stats.count_cols(cols)
    except:
        # Best effort: report the failure on stderr and continue, so that
        # whatever was counted so far can still be printed below.
        # NOTE(review): a bare except also catches KeyboardInterrupt and
        # SystemExit; the trailing pass is redundant.
        traceback.print_exc()
        print >> sys.stderr, "\n\n ------- STATS MAY BE EMPTY OR INCOMPLETE ----------"
        pass
    if args.stats:
        stats.print_basic_stats(out)
    if args.jsonstats:
        # Fetch the collected stats for the JSON report.
        d=stats.get_stats()
Exemplo n.º 6
0
def sent_set(inp):
    """Build a sentence-text -> occurrence-count dict for the input.

    For every tree from ``file_util.trees(inp)`` the FORM values of the
    lines with an all-digit ID are joined with single spaces; the resulting
    text is the key whose counter is incremented.
    """
    occurrences = {}
    for _, rows in file_util.trees(inp):
        # Only lines with a purely numeric ID take part in the sentence text.
        word_forms = [row[FORM] for row in rows if row[ID].isdigit()]
        key = u" ".join(word_forms)
        occurrences[key] = 1 + occurrences.get(key, 0)
    return occurrences