def process_input_phrase(fname):
    """Split a BIO-tagged label file into phrase-level samples and role labels.

    Each input line is expected to be tab-separated:
    token, character offset, BIO entity tag (col 3), BIO role tag (col 4).
    Lines of length <= 3 act as sentence separators.

    Returns:
        (sentences, labels): parallel lists with one entry per sentence;
        each sentence is a list of phrase dicts
        {'surface', 'entitylabel', 'headentity', 'offset', 'firstoffset'}
        and each label entry the corresponding role string.

    NOTE(review): this code was reconstructed from a flattened source; the
    indentation level of the `oldoffset`/`oldtype` updates (assumed here to
    run on every token line) should be confirmed against the original file.
    """
    content = utils.readFileEncode(fname, 'utf8')
    # drop the trailing empty string produced by the final newline
    lines = content.split('\n')[:-1]
    sentences, phrases, labels = [], [], []
    phrase, label, text = {}, [], []
    # oldtype: full BIO tag of the previous token line; firstoffset: offset of
    # the first token of the current phrase; thisrole: role tag without B-/I-.
    oldtype, oldoffset, firstoffset, thislabel, thisrole = '', 0, 0, 0, ''
    for i in range(len(lines)):
        if len(lines[i]) > 3:
            words = lines[i].split('\t')
            # select only samples that were labeled as in ArgumentList3
            if words[2][2:] in ArgumentList3:
                if words[2].startswith('B-'):
                    # a new phrase starts: flush the phrase collected so far
                    if text:
                        phrase={'surface':" ".join(text),'entitylabel':thislabel,'headentity':text[-1],
                                'offset':oldoffset,'firstoffset':firstoffset}
                        phrases.append(phrase)
                        label.append(thisrole)
                        text = []
                    text.append(words[0])
                    firstoffset = int(words[1])
                    thislabel = words[2][2:]
                    thisrole = words[3][2:]
                elif words[2].startswith('I-'):
                    if words[2][2:] == oldtype[2:]:
                        # continuation of the current phrase
                        text.append(words[0])
                    elif words[2][2:] != oldtype[2:]:
                        # I- tag of a *different* type: treat as a new phrase
                        if text:
                            phrase={'surface':" ".join(text),'entitylabel':thislabel,'headentity':text[-1],
                                    'offset':oldoffset,'firstoffset':firstoffset}
                            phrases.append(phrase)
                            label.append(thisrole)
                            text = []
                        text.append(words[0])
                        firstoffset = int(words[1])
                        thislabel = words[2][2:]
                        thisrole = words[3][2:]
            oldoffset = int(words[1])
            oldtype = words[2]
        else:
            # sentence separator: flush any open phrase, then close the sentence
            if text:
                phrase={'surface':" ".join(text),'entitylabel':thislabel,'headentity':text[-1],
                        'offset':oldoffset,'firstoffset':firstoffset}
                phrases.append(phrase)
                label.append(thisrole)
                text = []
            if len(phrases) > 0 and len(label) > 0:
                sentences.append(phrases)
                labels.append(label)
                phrases = []
                label = []
            elif len(phrases) == 0 and i < len(lines) - 1:
                # sentence with no selected phrases: keep alignment with empties
                sentences.append([])
                labels.append([])
    return sentences, labels
def main():
    """Parse command-line arguments, read the prediction file, and dispatch
    to the evaluation routine selected by --metric."""
    args = parser.parse_args()
    content = utils.readFileEncode(args.predictedfile, 'utf8')
    # drop the trailing empty string produced by the final newline
    lines = content.split('\n')[:-1]
    if args.metric == 'f1':
        gold, predicted, selectedlist, raw = collect(lines, args.options)
        mention(gold, predicted, selectedlist, args.O)
    elif args.metric == 'confusion_role':
        confusion_role(lines)
    elif args.metric == 'confusion_token':
        gold, predicted, selectedlist, raw = collectlabels(lines, args.options)
        confusion_token(gold, predicted, selectedlist, raw)
    elif args.metric == 'confusion_label':
        # BUGFIX: gold/predicted/selectedlist/raw were never assigned on this
        # path in the original code, so it always raised NameError. Collect the
        # label-level data the same way the confusion_token branch does.
        gold, predicted, selectedlist, raw = collectlabels(lines, args.options)
        confusion_label(gold, predicted, selectedlist, raw)
    print('=========================================')
def process_input(fname, onlynugget, onlyarg):
    """Split a tab-separated label file into sentences and label sequences.

    Input:
        fname      - file to be processed
        onlynugget - True to keep only nugget labels (others become 'O')
        onlyarg    - True to keep only argument labels (others become 'O');
                     'Software' argument tags are renamed to '<BI>-System'
    Output:
        sentences - list of sentences; each sentence is a list of
                    {'originalText': token, 'offset': int} dicts
        labels    - parallel list of per-token label lists
    """
    raw = utils.readFileEncode(fname, 'utf8')
    # drop the trailing empty string produced by the final newline
    rows = raw.split('\n')[:-1]
    last = len(rows) - 1
    sentences, labels = [], []
    sent, label = [], []
    for idx, row in enumerate(rows):
        if len(row) > 3:
            cols = row.split('\t')
            sent.append({'originalText': cols[0], 'offset': int(cols[1])})
            tag = cols[2]
            if onlynugget:
                label.append(tag if tag in NuggetList10 else 'O')
            elif onlyarg:
                if tag in ArgumentList:
                    # normalize Software tags to the System label
                    label.append(tag[0:2] + 'System' if 'Software' in tag else tag)
                else:
                    label.append('O')
        elif sent and label:
            # separator row: close out the current sentence
            sentences.append(sent)
            labels.append(label)
            sent, label = [], []
        elif not sent and idx < last:
            # consecutive separators: keep alignment with empty entries
            sentences.append([])
            labels.append([])
    return sentences, labels
def process_input(fname, onlytrigger, onlyarg):
    """Split a tab-separated label file into sentences and label sequences.

    Like the offset-carrying variant, but tokens keep only 'originalText'.
    onlytrigger keeps nugget labels; onlyarg keeps argument labels and
    renames 'Software' tags to '<BI>-System'; everything else becomes 'O'.
    Returns (sentences, labels) as parallel lists.
    """
    text = utils.readFileEncode(fname, 'utf8')
    # drop the trailing empty string produced by the final newline
    rows = text.split('\n')[:-1]
    sentences, labels = [], []
    current_sent, current_labels = [], []
    for row_idx, row in enumerate(rows):
        if len(row) <= 3:
            # separator row closes the current sentence
            if current_sent and current_labels:
                sentences.append(current_sent)
                labels.append(current_labels)
                current_sent, current_labels = [], []
            elif not current_sent and row_idx < len(rows) - 1:
                # consecutive separators: keep alignment with empty entries
                sentences.append([])
                labels.append([])
            continue
        fields = row.split('\t')
        current_sent.append({'originalText': fields[0]})
        tag = fields[2]
        if onlytrigger:
            current_labels.append(tag if tag in NuggetList10 else 'O')
        elif onlyarg:
            if tag not in ArgumentList:
                current_labels.append('O')
            elif 'Software' in tag:
                # normalize Software tags to the System label
                current_labels.append(tag[0:2] + 'System')
            else:
                current_labels.append(tag)
    return sentences, labels
def realis_to_ann(dir, result):
    """Append realis attribute lines ('A<id>\\tRealis E<id> <pred>') to each
    <fileno>_pred.ann file for every predicted phrase whose offsets contain
    the trigger's body-relative start offset.

    dir:    directory holding <fileno>_pred.ann and <fileno>.txt files
    result: {fileno: [{'offset': [ints], 'pred': realis-label, ...}, ...]}
    """
    for fileno in result.keys():
        attr_id = 1
        annfile = dir + fileno + '_pred.ann'
        ann_text = utils.readFileEncode(annfile, 'utf8')
        token, event, relationlist, attrlist = ann2xml.readAnn(ann_text)
        # offsets in `result` are relative to the document body, so subtract
        # the header length of the raw .txt file
        head = cuthead(dir + fileno + '.txt')
        f = codecs.open(annfile, 'a', 'utf8')
        for eventid in event.keys():
            triggerid = event[eventid]['triggertokenid']
            annoffset = int(token[triggerid]['startOffset']) - head
            for entry in result[fileno]:
                # membership test matches the original per-offset scan:
                # at most one attribute line per predicted entry
                if annoffset in entry['offset']:
                    f.write('A' + str(attr_id) + '\t' + 'Realis' + ' '
                            + eventid + ' ' + entry['pred'] + '\n')
                    attr_id += 1
        f.close()
def process_input_phrase(fname, labeloption):
    """Split a realis label file into phrase-level samples and realis labels.

    Each input line is tab-separated: token, offset, BIO event tag (col 3),
    ..., realis label (col 5). Lines of length <= 3 separate sentences.

    labeloption:
        1 - binary task: label phrases Generic vs NotGeneric
            (Other/Actual collapse to NotGeneric)
        2 - Actual-vs-Other task: keep the realis label and drop
            Generic phrases entirely

    Returns:
        (sentences, labels): parallel per-sentence lists; each phrase is
        {'surface', 'realislabel', 'offset', 'eventtype'}.
    """
    content = utils.readFileEncode(fname, 'utf8')
    # drop the trailing empty string produced by the final newline
    lines = content.split('\n')[:-1]
    sentences, phrases, labels = [], [], []
    label, text = [], []
    oldtype, offsets, oldevent = '', [], ''

    def flush():
        """Emit the phrase accumulated in `text`/`offsets` (callers reset them)."""
        phrase = {
            'surface': " ".join(text),
            'realislabel': oldtype,
            'offset': offsets,
            'eventtype': oldevent
        }
        if labeloption == 1:
            # generic vs specific
            label.append("NotGeneric" if oldtype in ('Other', 'Actual') else "Generic")
            phrases.append(phrase)
        elif labeloption == 2:
            # not generic -> actual vs other; skip Generic phrases.
            # BUGFIX: the original I- branch compared oldtype against
            # 'General' (typo) while the other two copies used 'Generic';
            # unified to 'Generic'.
            if oldtype != 'Generic':
                label.append(oldtype)
                phrases.append(phrase)

    for i in range(len(lines)):
        if len(lines[i]) > 3:
            words = lines[i].split('\t')
            # select only samples whose tag is a known event type
            if words[2] in EventList:
                if words[2].startswith('B-'):
                    # a new phrase starts: flush the one collected so far
                    if text:
                        flush()
                        text, offsets = [], []
                    text.append(words[0])
                    offsets.append(int(words[1]))
                elif words[2].startswith('I-'):
                    if words[2][2:] == oldevent:
                        # continuation of the current phrase
                        text.append(words[0])
                        offsets.append(int(words[1]))
                    else:
                        # I- tag of a different event type: start a new phrase
                        if text:
                            flush()
                            text, offsets = [], []
                        text.append(words[0])
                        offsets.append(int(words[1]))
            # track the realis label (col 5) and event type of this line
            # NOTE(review): assumes every token line has >= 5 columns
            oldtype = words[4]
            oldevent = words[2][2:]
        else:
            # sentence separator: flush any open phrase, then close the sentence
            if text:
                flush()
                text, offsets = [], []
            if len(phrases) > 0 and len(label) > 0:
                sentences.append(phrases)
                labels.append(label)
                phrases = []
                label = []
            elif len(phrases) == 0 and i < len(lines) - 1:
                # sentence with no selected phrases: keep alignment with empties
                sentences.append([])
                labels.append([])
    return sentences, labels
def _append_argument_annotation(f, token, event, entry, sample, offset, head, tokid, label):
    """Write one predicted argument span as a brat T line and attach it to the
    first event trigger that is compatible with the token's trigger hints.

    entry is the token-level prediction dict supplying 'triggerposition',
    'nearevent' and 'neartrigger'. Mutates `event` by appending to the matched
    event's 'arguments' list.
    """
    text = " ".join(sample)
    startoffset = offset[0] + head
    endoffset = offset[-1] + len(sample[-1]) + head
    f.write('T' + str(tokid) + '\t' + label + ' ' + str(startoffset) + ' '
            + str(endoffset) + '\t' + text + '\n')
    for eventid in event.keys():
        trggrtokenid = event[eventid]['triggertokenid']
        position = entry['triggerposition']
        if position == 'before':
            # trigger precedes the argument: span must start after trigger end
            near = startoffset > int(token[trggrtokenid]['endOffset'])
        elif position == 'after':
            near = endoffset > int(token[trggrtokenid]['startOffset'])
        else:
            # BUGFIX: the original compared against token[tokenid], a stale
            # loop variable left over from enumerating existing tokens, not
            # the trigger under consideration.
            near = startoffset > int(token[trggrtokenid]['endOffset'])
        if near and entry['nearevent'] == token[trggrtokenid]['label'] \
                and entry['neartrigger'] in token[trggrtokenid]['text']:
            event[eventid]['arguments'].append({'argname': label, 'value': tokid})
            break


def argument_to_ann(dir, result):
    """Append predicted argument spans (T lines) and event definitions
    (E lines) to each <fileno>_pred.ann file.

    dir:    directory holding <fileno>.txt / .content.json / _pred.ann files
    result: {fileno: {wordno: {'pred': BIO-label, 'text', 'offset',
            'triggerposition', 'nearevent', 'neartrigger', ...}}}
    """
    for fileno in result.keys():
        eventid = 1
        jfile = dir + fileno + '.content.json'
        content = utils.loadJsontoDict(jfile)
        sentences = content['sentences']  # loaded but unused; kept for parity
        txtfile = dir + fileno + '.txt'
        # offsets in `result` are body-relative; `head` re-bases them onto the
        # raw .txt file used by the ann offsets
        head = cuthead(txtfile)
        annfile = dir + fileno + '_pred.ann'
        content = utils.readFileEncode(annfile, 'utf8')
        token, event, relationlist, attrlist = ann2xml.readAnn(content)
        # rebuild the event table: every existing T token is a trigger
        idx, event = [], {}
        for tokenid in token.keys():
            # collect numeric ids to find the last token id from the ann
            idx.append(int(tokenid.replace('T', '')))
            event[eventid] = {
                'triggertokenid': tokenid,
                'name': token[tokenid]['label'],
                'arguments': []
            }
            eventid += 1
        tokid = max(idx) + 1  # next free T id for new argument spans
        f = codecs.open(annfile, 'a', 'utf8')
        sample, offset = [], []
        oldlabel = 'O'
        for wordno in result[fileno].keys():
            entry = result[fileno][wordno]
            predlabel = entry['pred']
            if predlabel.startswith('B-'):
                # new span begins: flush the one collected so far
                if sample:
                    _append_argument_annotation(f, token, event, entry, sample,
                                                offset, head, tokid, label)
                    tokid += 1
                    sample, offset = [], []
                sample.append(entry['text'])
                offset.append(entry['offset'])
                label = predlabel[2:]
            elif predlabel.startswith('I-'):
                if predlabel[2:] != oldlabel[2:]:
                    # I- tag of a different type: treat as a new span
                    if sample:
                        _append_argument_annotation(f, token, event, entry, sample,
                                                    offset, head, tokid, label)
                        tokid += 1
                        sample, offset = [], []
                    sample.append(entry['text'])
                    offset.append(entry['offset'])
                    label = predlabel[2:]
                else:
                    # continuation of the current span
                    sample.append(entry['text'])
                    offset.append(entry['offset'])
            oldlabel = entry['pred']
        # flush the final open span, if any
        if sample:
            _append_argument_annotation(f, token, event, entry, sample,
                                        offset, head, tokid, label)
        # emit one E line per event with its attached arguments
        for eventid in event.keys():
            f.write('E' + str(eventid) + '\t' + event[eventid]['name'] + ':'
                    + event[eventid]['triggertokenid'])
            for arg in event[eventid]['arguments']:
                f.write(' ' + arg['argname'] + ':' + 'T' + str(arg['value']))
            f.write('\n')
        f.close()
def cuthead(txtfile):
    """Return the character offset of the document body in *txtfile*.

    Finds the "<text>" marker and skips past it plus one extra character
    (presumably the newline that follows the tag — the original used a
    literal +7).
    """
    marker = "<text>"
    contents = utils.readFileEncode(txtfile, 'utf-8')
    return contents.index(marker) + len(marker) + 1