# Characters treated as trailing punctuation by remove_trailing_periods().
_TRAILING_PUNCTUATION = frozenset('.,:;)}]"\'?!')


def remove_trailing_periods(text, offset):
    """Strip one trailing punctuation character from a span and shrink its offset.

    If the last character of ``text`` is one of ``. , : ; ) } ] " ' ? !``, that
    character is removed together with any run of spaces immediately before it,
    and the end offset is reduced by the number of characters removed. Only a
    single punctuation character is stripped per call (``"foo.."`` becomes
    ``"foo."``). Text that does not end in one of those characters, or that is
    empty, is returned unchanged.

    The input ``offset`` is never modified; a new IntPair is always returned.

    :type text: str
    :type offset: IntPair
    :rtype: (str, IntPair)
    """
    newoffset = IntPair(offset.first, offset.second)

    # Guard against empty text: indexing text[-1] would raise IndexError.
    if not text or text[-1] not in _TRAILING_PUNCTUATION:
        return text, newoffset

    # i counts the characters to drop: the punctuation mark itself plus any
    # spaces directly preceding it. The scan is bounded by len(text) so that
    # inputs consisting solely of punctuation/spaces do not run off the front.
    i = 1
    while i < len(text) and text[-(i + 1)] == ' ':
        i += 1

    newtext = text[0:-i]
    newoffset.second = newoffset.second - i
    return newtext, newoffset
def _add_span_annotation(doc, event, event_id, event_type, line):
    """Parse one annotation line inside an <Event> block and attach it to ``event``.

    ``line`` is expected to hold three whitespace-separated fields: a label and
    two integer offsets. The label decides what gets created:

    * label equal to ``event_type``  -> an EventSpan
    * ``anchor``                     -> an Anchor
    * label containing ``/``         -> an EventArgument whose role is the part
      of the label after the first ``/``

    Lines with any other label are ignored, as are blank lines.

    :type doc: nlplingo.text.text_theory.Document
    :type event: Event
    :type event_id: str
    :type event_type: str
    :type line: str
    """
    tokens = line.split()
    if not tokens:
        # Blank line inside an event block: nothing to annotate.
        return

    info = tokens[0]
    offset = IntPair(int(tokens[1]), int(tokens[2]))

    is_event_span = info == event_type
    is_anchor = info == 'anchor'
    is_argument = '/' in info
    if not (is_event_span or is_anchor or is_argument):
        return

    # Build the warning up front so it reports the offsets as annotated,
    # before any trailing-space adjustment below.
    skip_warning = 'WARNING: skipping annotation span {} {}-{}'.format(doc.docid, offset.first, offset.second)

    text = doc.get_text(offset.first, offset.second)
    if text is None or text == '':
        print(skip_warning)
        return

    # sometimes, the UI captures an extra trailing space. Check for that and adjust ending offset
    if text[-1] == ' ':
        text = text[0:-1]
        offset.second = offset.second - 1
        if text == '':
            # The span was a single space; nothing is left to annotate.
            print(skip_warning)
            return

    # The order of these checks matters: a label equal to the event type wins
    # over 'anchor', which wins over the generic '/' argument form.
    if is_event_span:
        span_id = '{}.s-{}'.format(event_id, len(event.event_spans))
        event.add_event_span(EventSpan(span_id, offset, text, event_type))
    elif is_anchor:
        anchor_id = '{}.t-{}'.format(event_id, len(event.anchors))
        newtext, newoffset = remove_trailing_periods(text, offset)
        if text != newtext:
            print('- revising anchor, text=[%s] offset=(%d,%d) newtext=[%s] newoffset=(%d,%d)' % (text, offset.first, offset.second, newtext, newoffset.first, newoffset.second))
        event.add_anchor(Anchor(anchor_id, newoffset, newtext, event_type))
    else:
        # Argument span. The mention id is built from the offsets *before*
        # trailing punctuation is removed.
        em_id = 'm-{}-{}'.format(offset.first, offset.second)
        newtext, newoffset = remove_trailing_periods(text, offset)
        if text != newtext:
            print('- revising argument, text=[%s] offset=(%d,%d) newtext=[%s] newoffset=(%d,%d)' % (text, offset.first, offset.second, newtext, newoffset.first, newoffset.second))
        # we just use a dummy em first, for creating the EventArgument (notice that this em is not added to the doc)
        # later, when we annotate sentence, we will find an actual EntityMention that is backed by tokens
        # and use that to back the EventArgument
        # Ref: text_theory.annotate_sentence_with_events()
        em = EntityMention(em_id, newoffset, newtext, 'dummy')
        arg_role = info[info.index('/') + 1:]
        arg_id = '{}.t-{}'.format(event_id, len(event.arguments))
        event.add_argument(EventArgument(arg_id, em, arg_role))


def process_span_file(doc, filename):
    """Read event annotations from ``filename`` and add them to ``doc``.

    The file is read as UTF-8. Each event is a block delimited by an
    ``<Event type="...">`` line and an ``</Event>`` line; every line in between
    holds a label and two integer offsets separated by whitespace, e.g.::

        <Event type="CloseAccount">
        CloseAccount 0 230
        anchor 181 187
        CloseAccount/Source 165 170
        CloseAccount/Source 171 175
        CloseAccount/Source 176 180
        CloseAccount/Target 191 198
        CloseAccount/Target 207 214
        CloseAccount/Target 215 229
        </Event>

    Lines outside an event block are ignored. Blank lines inside a block are
    skipped. If the file ends before ``</Event>`` is seen, a warning is printed
    and the event is added with whatever annotations were read.

    :type filename: str
    :type doc: nlplingo.text.text_theory.Document
    :return: the same ``doc`` object that was passed in
    """
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    num_lines = len(lines)
    i = 0
    while i < num_lines:
        line = lines[i]
        i += 1
        if not line.startswith('<Event type='):
            continue

        event_type = re.search(r' type="(.*?)"', line).group(1)
        event_id = '{}.e-{}'.format(doc.docid, len(doc.events))
        event = Event(event_id, event_type)

        # Consume lines up to and including the closing tag. The bounds check
        # keeps a truncated file from raising IndexError.
        closed = False
        while i < num_lines:
            line = lines[i]
            i += 1
            if line.startswith('</Event>'):
                closed = True
                break
            _add_span_annotation(doc, event, event_id, event_type, line)

        if not closed:
            print('WARNING: missing </Event> for event {} in {}'.format(event_id, filename))
        doc.add_event(event)

    return doc