Example #1
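The loader below walks a directory of aligned resources (a .txt text file, a .ss sentence-split file, and .a1/.a2 stand-off annotation files, apparently in brat style), maps every text-bound annotation onto the sentence that contains it, and yields one Document per text.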
from itertools import chain


def _get_documents(dir):
    for id, txt_path, ss_path, a1_path, a2_path in _get_aligned_resources(dir):
        # First we align the text and the sentences, since we need to map the
        # stand-off offsets onto the sentences in the sentence-split file
        with open(txt_path, 'r') as txt_file:
            if ENCODE_WRAP:
                txt_file = _encode_wrap(txt_file)
            with open(ss_path, 'r') as ss_file:
                if ENCODE_WRAP:
                    ss_file = _encode_wrap(ss_file)
                s_starts_and_sentences = _get_sentences_and_offsets(
                    txt_file, ss_file)

        # XXX: HACK! Fall back to an empty stand-in when there is no .a2 file.
        if a2_path is None:
            a2_path = '/dev/null'

        with open(a1_path, 'r') as a1_file:
            if ENCODE_WRAP:
                a1_file = _encode_wrap(a1_file)
            with open(a2_path, 'r') as a2_file:
                if ENCODE_WRAP:
                    a2_file = _encode_wrap(a2_file)
                for line in (l.rstrip('\n') for l in chain(a1_file, a2_file)):
                    # We ignore everything apart from the text-bound annotations
                    match = TB_SO_REGEX.match(line)
                    if match is not None:
                        g_dict = match.groupdict()
                        ann_start = int(g_dict['start'])
                        ann_end = int(g_dict['end'])

                        # Find the sentence and its index containing the annotation
                        s_idx, sentence = _find_containing_idx(
                            ann_start, s_starts_and_sentences)

                        # XXX: There are cases where an annotation is cut off
                        #      by a sentence break. If this is the case, merge
                        #      the sentences.
                        if ann_end > s_idx + len(sentence.text):
                            next_s_idx, next_sentence = _find_containing_idx(
                                ann_end, s_starts_and_sentences)
                            # Merge the next sentence into this one
                            # XXX: Just assumes a space! May be wrong!
                            merged = Sentence(
                                sentence.text + ' ' + next_sentence.text,
                                sentence.annotations +
                                next_sentence.annotations)
                            # Swap the merged sentence in and drop the absorbed
                            # one, so that later look-ups and the final yield
                            # see the merge rather than the stale originals
                            s_starts_and_sentences[s_starts_and_sentences.index(
                                (s_idx, sentence))] = (s_idx, merged)
                            s_starts_and_sentences.remove(
                                (next_s_idx, next_sentence))
                            sentence = merged

                        # Create an annotation object but adjust the indices to
                        # be relative to the sentence and not to the file
                        new_ann_start = ann_start - s_idx
                        assert 0 <= new_ann_start < len(sentence.text), (
                            '0 <= {} < {} ({}, {}) {} "{}" {}'.format(
                                new_ann_start, len(sentence.text), s_idx,
                                g_dict['start'], id, g_dict['text'], s_idx))
                        new_ann_end = ann_end - s_idx
                        assert 0 < new_ann_end <= len(sentence.text), (
                            '0 < {} <= {} ({}, {}) {} {}'.format(
                                new_ann_end, len(sentence.text), s_idx,
                                g_dict['end'], id, g_dict['text']))
                        assert new_ann_start < new_ann_end
                        # Reuse the sentence-relative offsets computed above
                        annotation = Annotation(new_ann_start, new_ann_end,
                                                g_dict['type'])

                        # If we have a text span in the stand-off, sanity-check
                        # it against what is in the sentence
                        if g_dict['text'] is not None:
                            # XXX: The regex is not perfect, it leaves spaces
                            #      around the span
                            g_dict['text'] = unicode(
                                g_dict['text'].strip('\r\n'), encoding='utf-8')
                            target_ann_text = sentence.annotation_text(
                                annotation)
                            assert target_ann_text == g_dict['text'], (
                                'text span mismatch in {} '
                                'target: "{}" != source: "{}" {} "{}" {} {} {}'
                            ).format(id, target_ann_text, g_dict['text'],
                                     annotation, sentence.text, g_dict,
                                     type(target_ann_text),
                                     type(g_dict['text']))

                        sentence.add_annotation(annotation)

        yield Document(id, [],
                       [sentence for _, sentence in s_starts_and_sentences],
                       txt_path)
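The code above only assumes TB_SO_REGEX through its named groups ('type', 'start', 'end', 'text'). Below is a hypothetical reconstruction of that pattern, assuming the usual brat text-bound annotation layout ("T1<TAB>Type START END<TAB>text"); the real pattern in the source module may differ.

import re

# Hypothetical TB_SO_REGEX: matches brat-style text-bound annotations such as
# "T1<TAB>Protein 48 53<TAB>BMP-6"; the trailing text span is optional.
TB_SO_REGEX = re.compile(
    r'^(?P<id>T\d+)\t(?P<type>\S+) (?P<start>\d+) (?P<end>\d+)'
    r'(?:\t(?P<text>.*))?$')

match = TB_SO_REGEX.match('T1\tProtein 48 53\tBMP-6')
assert match is not None
g_dict = match.groupdict()
assert g_dict['type'] == 'Protein'
assert (int(g_dict['start']), int(g_dict['end'])) == (48, 53)
assert g_dict['text'] == 'BMP-6'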
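For readers without the surrounding module, here is a minimal sketch of the other helpers and data classes the loader relies on. The names come from the code above; the bodies below are assumptions for illustration, not the original implementations.

import codecs

ENCODE_WRAP = True  # assumed module flag: wrap byte streams in a UTF-8 reader


def _encode_wrap(f, encoding='utf-8'):
    # On Python 2, open() yields byte strings; wrap the stream so iteration
    # yields unicode instead.
    return codecs.getreader(encoding)(f)


class Annotation(object):

    def __init__(self, start, end, type_):
        # Offsets are relative to the containing sentence, not the file.
        self.start, self.end, self.type = start, end, type_


class Sentence(object):

    def __init__(self, text, annotations=None):
        self.text = text
        self.annotations = list(annotations or [])

    def add_annotation(self, annotation):
        self.annotations.append(annotation)

    def annotation_text(self, annotation):
        # The slice of the sentence covered by the annotation.
        return self.text[annotation.start:annotation.end]


def _find_containing_idx(offset, s_starts_and_sentences):
    # s_starts_and_sentences is assumed to be a list of
    # (sentence start offset in the file, Sentence) pairs in document order.
    for s_start, sentence in s_starts_and_sentences:
        if s_start <= offset <= s_start + len(sentence.text):
            return s_start, sentence
    raise ValueError('no sentence contains offset %d' % offset)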