def _text_for_offsets(text, offsets):
    """
    Return the text covered by a list of (start, end) integer offsets.

    Each offset pair is used to slice ``text``; the resulting fragments
    are concatenated with DISCONT_SEP between them, which is the form
    expected by a TextBoundAnnotation(WithText).

    If the fragments cannot be extracted for any reason, the failure is
    reported through Messager.error and ProtocolArgumentError is raised.
    """
    try:
        fragments = [text[begin:finish] for begin, finish in offsets]
        return DISCONT_SEP.join(fragments)
    except Exception:
        # Any failure here is treated as a bad request argument.
        message = '_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets)
        Messager.error(message)
        raise ProtocolArgumentError
示例#2
0
def _text_for_offsets(text, offsets):
    """Extract and join the pieces of ``text`` selected by ``offsets``.

    ``offsets`` is a list of (start, end) integer pairs. The slice for
    each pair is taken from ``text`` and the slices are joined with
    DISCONT_SEP, ready for use in a TextBoundAnnotation(WithText).

    On any failure the problem is reported via Messager.error and
    ProtocolArgumentError is raised.
    """
    try:
        chunks = []
        for first, last in offsets:
            chunks.append(text[first:last])
        joined = DISCONT_SEP.join(chunks)
    except Exception:
        # Malformed offsets (or text) are surfaced as an argument error.
        Messager.error(
            '_text_for_offsets: failed to get text for given offsets (%s)' %
            str(offsets))
        raise ProtocolArgumentError
    return joined
示例#3
0
def _split_offsets_on_newlines(text, offsets):
    """Split each (start, end) offset at newlines and trim whitespace.

    For every offset pair, the covered slice of ``text`` is cut at each
    newline character; leading and trailing whitespace is trimmed from
    every piece, and pieces that end up empty are dropped.

    Returns a list of (start, end) tuples indexing into ``text``. The list
    is empty when nothing but whitespace (or nothing at all) is covered.
    """
    seg_offsets = []
    for o_start, o_end in offsets:
        pos = o_start
        # Split only the text of *this* offset. Splitting the joined text
        # of all offsets would measure every offset against the combined
        # string and yield wrong positions for discontinuous spans.
        for text_seg in text[o_start:o_end].split('\n'):
            if not text_seg and o_start != o_end:
                # Empty piece (e.g. double newline): step over the newline.
                pos += 1
                continue
            start = pos
            end = start + len(text_seg)

            # For the next iteration the position is after the newline.
            pos = end + 1

            # Adjust the offsets to compensate for any potential leading
            #   and trailing whitespace.
            start += len(text_seg) - len(text_seg.lstrip())
            end -= len(text_seg) - len(text_seg.rstrip())

            # Keep the piece only if something is left. A whitespace-only
            #   piece trims to start > end, so compare with "<" rather
            #   than "!=" to avoid recording an inverted span.
            if start < end:
                seg_offsets.append((start, end, ))
    return seg_offsets


def __create_span(ann_obj, mods, type, offsets, txt_file_path,
                  projectconf, attributes):
    """Create (or reuse) a text-bound annotation and, if needed, its event.

    Arguments:
    ann_obj -- annotation container; used here through get_textbounds(),
        get_new_id() and add_annotation().
    mods -- modification tracker; every added annotation is reported via
        mods.addition().
    type -- annotation type name (shadows the builtin; kept because it is
        part of the existing signature).
    offsets -- list of (start, end) integer offsets into the document text.
    txt_file_path -- path of the document text file, read to obtain the
        annotated text.
    projectconf -- project configuration; used here through
        is_event_type() and is_physical_entity_type().
    attributes -- not used by this function.

    Returns a tuple (ann, event): the text-bound annotation that was
    created or reused, and the newly created EventAnnotation, or None when
    ``type`` is a physical entity type.

    Raises ProtocolArgumentError (from _text_for_offsets) when the offsets
    cannot be used to slice the document text.
    """
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the document text
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()

        # Called for validation only: reports the problem and raises
        #   ProtocolArgumentError if the offsets cannot slice the text.
        _text_for_offsets(text, offsets)

        # Resolve cases where there are newlines in the offsets by
        #   creating discontinuous annotations for each span separated by
        #   newlines. For most cases this preserves the offsets.
        seg_offsets = _split_offsets_on_newlines(text, offsets)

        # If nothing is left (null-span or whitespace-only selection),
        #   fall back to the offsets as given.
        if not seg_offsets:
            seg_offsets = offsets

        ann_text = DISCONT_SEP.join((text[start:end]
                                     for start, end in seg_offsets))
        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type, ann_text)
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    # ann is always set at this point (either newly created or reused), so
    #   no "no annotation" case needs handling here.
    if projectconf.is_physical_entity_type(type):
        # TODO: alert that negation / speculation are ignored if set
        event = None
    else:
        # Create the event also
        new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
        event = EventAnnotation(
            ann.id, [], str(new_event_id), type, '')
        ann_obj.add_annotation(event)
        mods.addition(event)

    return ann, event