assert len(x)>0 print_sent(x,[u"#---sentence---splitter---JOIN-TO-PREVIOUS-SENTENCE---"]) if __name__=="__main__": parser = argparse.ArgumentParser(description='Split/merge long sentences. Use --reverse to merge.') parser.add_argument('--reverse', default=False, action="store_true", help='Reverse the splitting.') parser.add_argument('-N', '--max-len', type=int, default=120, help='Pass sentences shorter or equal to this number of tokens through, split the rest. This will also be the absolute maximum chunk size ever fed into the parser. Default %(default)d.') parser.add_argument('-C', '--chunk-size', type=int, default=80, help='Split into chunks of approximately this size. Default %(default)d.') parser.add_argument('input', nargs='?', help='Input. Nothing or "-" for stdin.') args = parser.parse_args() args.leeway=args.chunk_size//3 #TODO - better value maybe? if args.reverse: last_len=None last_root=None for sent,comments in read_conll(args.input,0): if len(comments)==1 and comments[0]==u"#---sentence---splitter---JOIN-TO-PREVIOUS-SENTENCE---": part_root=get_root(sent) #root of this chunk renumber(sent,last_len) sent[part_root][HEAD]=unicode(last_root+1) #...dep to the previous one sent[part_root][PHEAD]=unicode(last_root+1) sent[part_root][DEPREL]=u"dep" sent[part_root][PDEPREL]=u"dep" last_root=int(sent[part_root][ID])-1 #...and remember the renumbered root for the possible next one print_sent(sent,[],False) last_len+=len(sent) else: if last_len is not None: print >> out8 last_root=get_root(sent) print_sent(sent,comments,False)
def visualize_clauses(args):
    """Print an HTML page showing every sentence and each of its labelled clauses.

    Sentences come from ``read_conll(args.input, args.max_sent)``.  Each token
    row is filed under the running sentence number and, when its last column
    (index 9 for 10-column rows, index 12 otherwise) is not ``u"_"``, also
    under that column's value.  Every group is rendered as a 10-column tree
    wrapped in the module-level ``header`` / ``footer``; groups other than the
    whole sentence are renumbered from 2 and get a synthetic row with id 1
    standing in for the outside head.  The rendered text replaces the first
    ``CONTENTGOESHERE`` in ``templates/simple_brat_viz.html`` (under
    ``SCRIPTDIR``) and the result is printed UTF-8 encoded to stdout.
    """
    rendered = []
    count = 1
    for sent, comments in read_conll(args.input, args.max_sent):
        by_label = defaultdict(list)
        for line in sent:
            if len(line) == 10:
                # 10-column row (CoNLL-U): used as is, features sorted in place.
                line[5] = sort_feat(line[5])
                row = line
                label = line[9]
            else:
                # Wider row (CoNLL-09): pick the columns that end up as
                # id, token, lemma, pos, pos, feat, head, deprel, then add
                # a placeholder DEPS and the MISC value.
                line[6] = sort_feat(line[6])
                row = [line[0], line[1], line[2], line[4], line[5],
                       line[6], line[8], line[10]]
                row.append(u"_")
                row.append(line[12])
                label = line[12]
            by_label[count].append(row)
            if label != u"_":
                # The same row object is shared with the whole-sentence list.
                by_label[label].append(row)

        for key, tree in sorted(by_label.iteritems()):
            root = None
            root_deprel = u"ROOT"
            root_token = u"ROOT"
            if key != count:
                # Old id -> new id; new ids start at 2 because id 1 is taken
                # by the synthetic root row emitted below.
                indexes = {}
                for row in tree:
                    indexes[int(row[0])] = len(indexes) + 2
                for row in tree:
                    row[0] = unicode(indexes[int(row[0])])
                    head = int(row[6])
                    if head in indexes:
                        row[6] = unicode(indexes[head])
                        continue
                    # Head lies outside this group: attach to the synthetic root.
                    row[6] = u"1"
                    root = row[0]
                    root_deprel = row[7]
                    if head != 0:
                        root_token = by_label[count][head - 1][1]

            # Serialise the tree.
            parts = [header, u"# sentence-label\t%s\n" % (unicode(key))]
            if root is not None:
                parts.append(u"# visual-style\t%s\tbgColor:red\n" % (u"1"))
                parts.append(u"# visual-style %s %s %s\tcolor:red\n"
                             % (u"1", root, root_deprel))
            if comments:
                parts.append(u"\n".join(comments) + u"\n")
            if key != count:
                root_token = u"**%s**" % (root_token)
                synthetic = [u"1", root_token, u"_", u"_", u"_", u"_",
                             u"0", root_deprel, u"_", u"_"]
                parts.append(u"\t".join(synthetic) + u"\n")
            for row in tree:
                parts.append(u"\t".join(row[col] for col in xrange(10)) + u"\n")
            parts.append(u"\n")  # an empty line terminates every tree
            parts.append(footer)

            # A group identical to the whole sentence is not shown twice.
            if key == count or by_label[key] != by_label[count]:
                rendered.append(u"".join(parts))
        count += 1

    data_to_print = u"".join(rendered)
    template_path = os.path.join(SCRIPTDIR, u"templates", "simple_brat_viz.html")
    with codecs.open(template_path, u"r", u"utf-8") as template:
        data = template.read().replace(u"CONTENTGOESHERE", data_to_print, 1)
        print >> sys.stdout, data.encode(u"utf-8")
# Script: echo the sentences yielded by read_conll(None, 0) without their
# comment lines, and save the comments as JSON keyed by sentence number.
from visualize import read_conll
import codecs
import json
try:
    import argparse
except ImportError:
    import compat.argparse as argparse


def print_sent(sent):
    """Print one sentence: columns joined by tabs, rows by newlines, as UTF-8."""
    rows = [u"\t".join(cols) for cols in sent]
    print (u"\n".join(rows)).encode(u"utf-8")


parser = argparse.ArgumentParser(description='Options')
parser.add_argument('-d', required=True, help='Where to save the comments?')
args = parser.parse_args()

comms = {}  # 1-based sentence number -> comment lines of that sentence
sent_count = 0  # stays 0 when the input holds no sentences
for sent_count, (sent, comments) in enumerate(read_conll(None, 0), 1):
    if comments:
        comms[sent_count] = comments
    if sent_count > 1:
        # Blank line between consecutive sentences (none before the first).
        print
    print_sent(sent)

with codecs.open(args.d, u"w") as f:
    json.dump(comms, f)
# Script: write the sentences produced by read_conll(None, 0) to stdout with
# their comments removed; the comments go to the JSON file named by -d,
# indexed by 1-based sentence number.
from visualize import read_conll
import codecs
import json
try:
    import argparse
except ImportError:
    import compat.argparse as argparse


def print_sent(sent):
    """Emit the rows of *sent* tab-separated, one row per line, UTF-8 encoded."""
    text = u"\n".join(map(u"\t".join, sent))
    print(text).encode(u"utf-8")


parser = argparse.ArgumentParser(description='Options')
parser.add_argument('-d', required=True, help='Where to save the comments?')
args = parser.parse_args()

comms = dict()
sent_count = 0
for sent, comments in read_conll(None, 0):
    if sent_count:
        # At least one sentence was already printed: separate with a blank line.
        print
    sent_count += 1
    if comments:
        comms[sent_count] = comments
    print_sent(sent)

with codecs.open(args.d, u"w") as f:
    json.dump(comms, f)
def visualize_clauses(args):
    """Build the clause visualisation page and print it to stdout.

    For each ``(sent, comments)`` pair from
    ``read_conll(args.input, args.max_sent)`` the token rows are grouped twice:
    under the integer sentence counter, and under the value of their last
    column when that value is not ``u"_"``.  Each group becomes one tree of
    10 tab-separated columns between the module-level ``header`` and
    ``footer``.  Groups other than the full sentence have their ids remapped
    to 2, 3, ... and are preceded by an artificial id-1 row representing the
    head outside the group.  The concatenated trees are substituted for the
    first ``CONTENTGOESHERE`` in ``templates/simple_brat_viz.html`` (located
    under ``SCRIPTDIR``) and the page is printed UTF-8 encoded.
    """
    data_to_print = u""
    count = 1
    for sent, comments in read_conll(args.input, args.max_sent):
        trees = defaultdict(list)
        for fields in sent:
            if len(fields) != 10:
                # conll-09: keep columns 0,1,2,4,5,6,8,10 -- later code reads
                # position 6 as the head and position 7 as the deprel --
                # then add DEPS ("_") and MISC (column 12).
                fields[6] = sort_feat(fields[6])
                entry = [fields[i] for i in (0, 1, 2, 4, 5, 6, 8, 10)]
                entry.extend([u"_", fields[12]])
                label = fields[12]
            else:
                # conll-u: the row is used directly.
                fields[5] = sort_feat(fields[5])
                entry = fields
                label = fields[9]
            trees[count].append(entry)
            if label != u"_":
                trees[label].append(entry)

        for key in sorted(trees):
            tree = trees[key]
            is_clause = key != count
            root = None
            root_deprel = u"ROOT"
            root_token = u"ROOT"
            if is_clause:
                # Remap ids so that 1 stays free for the artificial root row.
                indexes = {}
                for entry in tree:
                    token = int(entry[0])
                    indexes[token] = len(indexes) + 2
                for entry in tree:
                    entry[0] = unicode(indexes[int(entry[0])])
                    head = int(entry[6])
                    if head not in indexes:
                        # This token is the root of the clause.
                        entry[6] = u"1"
                        root = entry[0]
                        root_deprel = entry[7]
                        if head != 0:
                            # Show the token form of the outside head.
                            root_token = trees[count][head - 1][1]
                    else:
                        entry[6] = unicode(indexes[head])

            # tree to text
            text = header + u"# sentence-label\t%s\n" % (unicode(key))
            if root is not None:
                text += u"# visual-style\t%s\tbgColor:red\n" % (u"1")
                text += u"# visual-style %s %s %s\tcolor:red\n" % (
                    u"1", root, root_deprel)
            if comments:
                text += u"\n".join(comments) + u"\n"
            if is_clause:
                root_token = u"**%s**" % (root_token)
                text += u"\t".join([
                    u"1", root_token, u"_", u"_", u"_", u"_", u"0",
                    root_deprel, u"_", u"_"
                ]) + u"\n"
            text += u"".join(
                u"\t".join(entry[i] for i in range(10)) + u"\n"
                for entry in tree)
            text += u"\n"  # every tree ends with an empty line
            text += footer

            # Skip a clause whose rows equal the whole sentence's rows.
            if not is_clause or trees[key] != trees[count]:
                data_to_print += text
        count += 1

    template_file = os.path.join(SCRIPTDIR, u"templates",
                                 "simple_brat_viz.html")
    with codecs.open(template_file, u"r", u"utf-8") as template:
        data = template.read().replace(u"CONTENTGOESHERE", data_to_print, 1)
        print >> sys.stdout, data.encode(u"utf-8")
'-C', '--chunk-size', type=int, default=80, help= 'Split into chunks of approximately this size. Default %(default)d.') parser.add_argument('input', nargs='?', help='Input. Nothing or "-" for stdin.') args = parser.parse_args() args.leeway = args.chunk_size // 3 #TODO - better value maybe? if args.reverse: last_len = None last_root = None for sent, comments in read_conll(args.input, 0): if len(comments) == 1 and comments[ 0] == u"#---sentence---splitter---JOIN-TO-PREVIOUS-SENTENCE---": part_root = get_root(sent) #root of this chunk renumber(sent, last_len) sent[part_root][HEAD] = unicode(last_root + 1) #...dep to the previous one sent[part_root][PHEAD] = unicode(last_root + 1) sent[part_root][DEPREL] = u"dep" sent[part_root][PDEPREL] = u"dep" last_root = int( sent[part_root][ID] ) - 1 #...and remember the renumbered root for the possible next one print_sent(sent, [], False) last_len += len(sent) else: