Exemplo n.º 1
0
def remove_trailing_periods(text, offset):
    """Strip one trailing punctuation mark, and any spaces just before it, from a span.

    Despite the name, any of ``. , : ; ) } ] " ' ? !`` is handled. Only the final
    character of ``text`` is examined, so at most one punctuation mark is removed;
    spaces that immediately precede it are removed along with it. ``offset`` itself
    is not modified: a new IntPair is returned whose ``second`` is reduced by the
    number of characters removed.

    If ``text`` is empty or does not end in one of the marks, it is returned
    unchanged together with a copy of ``offset``. If ``text`` consists only of spaces
    followed by one mark, the returned text is the empty string.

    :type text: str
    :type offset: IntPair
    :return: tuple of (possibly shortened text, new IntPair for that text)
    """
    newoffset = IntPair(offset.first, offset.second)
    trailing_chars = {'.', ',', ':', ';', ')', '}', ']', '"', '\'', '?', '!'}

    # Guard against empty input: text[-1] would raise IndexError.
    if not text or text[-1] not in trailing_chars:
        return text, newoffset

    # Drop the punctuation mark, then any spaces that preceded it. rstrip() stops at
    # the start of the string, so this cannot index past the beginning the way a
    # manual backwards scan can.
    newtext = text[:-1].rstrip(' ')
    newoffset.second = newoffset.second - (len(text) - len(newtext))
    return newtext, newoffset
Exemplo n.º 2
0
def process_span_file(doc, filename):
    """Read event annotations from ``filename`` and add them to ``doc``.

    The file is a sequence of ``<Event type="..."> ... </Event>`` blocks. Every line
    inside a block has three whitespace-separated fields: a label, a start offset and
    an end offset. The label decides what the span becomes:

    * the event type itself -> an EventSpan
    * ``anchor`` -> an Anchor
    * anything containing ``/`` -> an EventArgument whose role is the text after the
      first ``/``

    Lines with any other label are ignored, as are lines outside a block. Example:

    <Event type="CloseAccount">
    CloseAccount	0	230
    anchor	181	187
    CloseAccount/Source	165	170
    CloseAccount/Source	171	175
    CloseAccount/Source	176	180
    CloseAccount/Target	191	198
    CloseAccount/Target	207	214
    CloseAccount/Target	215	229
    </Event>

    Blank lines inside a block are skipped, lines with fewer than three fields are
    skipped with a warning, and a block that is never closed with ``</Event>`` is
    still added to ``doc`` (with a warning) using whatever spans were read.

    :type filename: str
    :type doc: nlplingo.text.text_theory.Document
    :return: the same ``doc`` object that was passed in
    """
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]
    """:type: list[str]"""

    num_lines = len(lines)
    i = 0
    while i < num_lines:
        line = lines[i]
        if line.startswith('<Event type='):
            event_type = re.search(r' type="(.*?)"', line).group(1)
            event_id = '{}.e-{}'.format(doc.docid, len(doc.events))
            event = Event(event_id, event_type)

            i += 1
            # Bounding on num_lines keeps a block with no closing tag from indexing
            # past the end of the file.
            while i < num_lines and not lines[i].startswith('</Event>'):
                line = lines[i]
                i += 1

                tokens = line.split()
                if not tokens:
                    continue  # blank line inside the block
                if len(tokens) < 3:
                    print('WARNING: skipping malformed annotation line in {}: {}'.format(filename, line))
                    continue

                info = tokens[0]
                offset = IntPair(int(tokens[1]), int(tokens[2]))

                if not (info == event_type or info == 'anchor' or '/' in info):
                    continue  # unrecognized label

                text = doc.get_text(offset.first, offset.second)
                if text is None or text == '':
                    print('WARNING: skipping annotation span {} {}-{}'.format(doc.docid, offset.first, offset.second))
                    continue

                # sometimes, the UI captures an extra trailing space. Check for that and adjust ending offset
                if text[-1] == ' ':
                    text = text[0:-1]
                    offset.second = offset.second - 1

                if info == event_type:  # this is an event span
                    span_id = '{}.s-{}'.format(event_id, len(event.event_spans))
                    event.add_event_span(EventSpan(span_id, offset, text, event_type))
                elif info == 'anchor':  # anchor span
                    anchor_id = '{}.t-{}'.format(event_id, len(event.anchors))
                    newtext, newoffset = remove_trailing_periods(text, offset)
                    if text != newtext:
                        print('- revising anchor, text=[%s] offset=(%d,%d) newtext=[%s] newoffset=(%d,%d)' % (text, offset.first, offset.second, newtext, newoffset.first, newoffset.second))
                    event.add_anchor(Anchor(anchor_id, newoffset, newtext, event_type))
                else:  # label contains '/': argument span
                    # the mention id is built from the offsets before punctuation is trimmed
                    em_id = 'm-{}-{}'.format(offset.first, offset.second)
                    newtext, newoffset = remove_trailing_periods(text, offset)
                    if text != newtext:
                        print('- revising argument, text=[%s] offset=(%d,%d) newtext=[%s] newoffset=(%d,%d)' % (text, offset.first, offset.second, newtext, newoffset.first, newoffset.second))
                    em = EntityMention(em_id, newoffset, newtext, 'dummy')
                    # we just use a dummy em first, for creating the EventArgument (notice that this em is not added to the doc)
                    # later, when we annotate sentence, we will find an actual EntityMention that is backed by tokens
                    # and use that to back the EventArgument
                    # Ref: text_theory.annotate_sentence_with_events()
                    arg_role = info[info.index('/') + 1:]
                    arg_id = '{}.t-{}'.format(event_id, len(event.arguments))
                    event.add_argument(EventArgument(arg_id, em, arg_role))

            if i >= num_lines:
                print('WARNING: missing </Event> for event {} in {}'.format(event_id, filename))
            doc.add_event(event)
        # steps past the '</Event>' line after a block, or past an ignored line otherwise
        i += 1
    return doc