Example #1
0
 def __handleParagraph(self, xmlParagraph, paragraph):
     """Populate *paragraph* from an XML paragraph node.

     Copies the node's ``ID`` attribute into ``paragraph.idx`` and, for
     every child element named ``S``, builds a Sentence via
     ``__handleSentence`` and attaches it to the paragraph (linking the
     sentence back to its paragraph as well).
     """
     p = paragraph
     p.idx = xmlParagraph.attributes['ID'].nodeValue

     if xmlParagraph.hasChildNodes():
         for xmlNode in xmlParagraph.childNodes:
             if xmlNode.nodeName == 'S':
                 s = Sentence()
                 self.__handleSentence(xmlNode, s)
                 p.addSentence(s)
                 # Maintain the back-reference from sentence to paragraph.
                 s.paragraph = p
Example #2
0
def _tab_separated_input_to_doc(input):
    """Turn a tab-separated file into a Document.

    Each line of *input* is ``text<TAB>type``; every line becomes one
    Sentence in the document's abstract, carrying a single Annotation of
    the given type spanning the whole sentence text.
    """
    doc = Document(input.name, [], [], '<%s>' % input.name)
    for raw_line in input:
        sentence_text, ann_type = raw_line.rstrip('\n').split('\t')
        whole_span = Annotation(0, len(sentence_text), ann_type)
        doc.abstract.append(Sentence(sentence_text, [whole_span]))
    return doc
Example #3
0
def _get_sentences_and_offsets(txt_handle, ss_handle):
    """Align each sentence from *ss_handle* against the raw text in *txt_handle*.

    Returns a list of ``(start_offset, Sentence)`` tuples, where
    ``start_offset`` is the number of characters that had been read from
    *txt_handle* when the sentence started (a read count, not a byte
    ``tell()`` position — see ``txt_handle_reads`` below).
    Fails with an AssertionError if any sentence cannot be aligned before
    EOF of *txt_handle*.
    """
    s_starts_and_sentences = []
    # Characters consumed from txt_handle so far. Tracked manually because
    # tell() reports byte positions, which diverge from character counts on
    # multi-byte UTF-8 input.
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                          ignore_mult=set((' ', )))

        t_char = None
        started_at = txt_handle.tell()
        started_at_read = txt_handle_reads
        while True:
            # Feed the raw text into the aligner one character at a time.
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                # Reached EOF before the sentence matched: alignment failed.
                assert False, ('could not align all sentences for: '
                               '"{}" and "{}" stopped at the sentence: "{}" '
                               'aligner in state: {}').format(
                                   txt_handle.name, ss_handle.name, s_text,
                                   aligner.__repr__())
            try:
                if aligner.align(t_char):
                    source_text = _str(aligner)

                    # We are aligned!
                    s_starts_and_sentences.append((
                        #txt_handle.tell() - len(source_text),
                        #started_at,
                        started_at_read,
                        Sentence(source_text, [])))
                    #last_end += aligner.char_cnt
                    break
            except MisalignedError:
                # Mismatch: restart the candidate match at the current
                # position in the text and keep scanning.
                started_at = txt_handle.tell()
                started_at_read = txt_handle_reads
                pass

    #s_starts_and_sentences.sort()
    return s_starts_and_sentences
Example #4
0
def _get_documents(dir):
    """Yield one Document per aligned resource set found in *dir*.

    For each (txt, ss, a1, a2) resource group this aligns the sentence-split
    file against the raw text, then projects the stand-off annotations from
    the .a1/.a2 files onto the containing sentences, converting file-level
    offsets to sentence-relative offsets. Sentences cut by an annotation
    that crosses a sentence break are merged.
    """
    for id, txt_path, ss_path, a1_path, a2_path in _get_aligned_resources(dir):
        #print id
        # First we align the text and the sentences since we need to map the
        # offsets of the stand-off to map to the sentences in the sentence
        # split file
        #with open(txt_path, 'r') as txt_file, open(ss_path, 'r') as ss_file:
        with open(txt_path, 'r') as txt_file:
            if ENCODE_WRAP:
                txt_file = _encode_wrap(txt_file)
            with open(ss_path, 'r') as ss_file:
                if ENCODE_WRAP:
                    ss_file = _encode_wrap(ss_file)
                #sentences, s_offset_by_sentence = (
                s_starts_and_sentences = (_get_sentences_and_offsets(
                    txt_file, ss_file))

        #XXX: HACK!
        # A missing .a2 file is treated as empty by reading /dev/null.
        if a2_path is None:
            a2_path = '/dev/null'

        #with open(a1_path, 'r') as a1_file, open(a2_path, 'r') as a2_file:
        with open(a1_path, 'r') as a1_file:
            if ENCODE_WRAP:
                a1_file = _encode_wrap(a1_file)
            with open(a2_path, 'r') as a2_file:
                if ENCODE_WRAP:
                    a2_file = _encode_wrap(a2_file)
                for line in (l.rstrip('\n') for l in chain(a1_file, a2_file)):
                    # We ignore everything apart from the text-bound annotations
                    match = TB_SO_REGEX.match(line)
                    if match is not None:
                        g_dict = match.groupdict()
                        ann_start = int(g_dict['start'])
                        ann_end = int(g_dict['end'])

                        # Find the sentence and its index containing the annotation
                        # (s_idx is the sentence's start offset in the text file).
                        s_idx, sentence = _find_containing_idx(
                            ann_start, s_starts_and_sentences)

                        # XXX: There are cases where an annotation is cut-off
                        #       by a sentence break. If this is the case, merge
                        #       the sentences.
                        if ann_end > s_idx + len(sentence.text):
                            next_s_idx, next_sentence = _find_containing_idx(
                                ann_end, s_starts_and_sentences)
                            # Merge the next sentence into this one
                            # XXX: Just assumes a space! May be wrong!
                            sentence = Sentence(
                                sentence.text + ' ' + next_sentence.text,
                                sentence.annotations +
                                next_sentence.annotations)
                            # Remove the old one
                            s_starts_and_sentences.remove(
                                (next_s_idx, next_sentence))

                        # Create an annotation object but adjust the indices to
                        # be relative to the sentence and not to the file
                        new_ann_start = ann_start - s_idx
                        assert 0 <= new_ann_start < len(
                            sentence.text
                        ), '0 <= {} < {} ({}, {}) {} "{}" {}'.format(
                            new_ann_start, len(sentence.text), s_idx,
                            g_dict['start'], id, g_dict['text'], s_idx)
                        new_ann_end = ann_end - s_idx
                        assert 0 < new_ann_end <= len(
                            sentence.text
                        ), '0 < {} <= {} ({}, {}) {} {}'.format(
                            new_ann_end, len(sentence.text), s_idx,
                            g_dict['end'], id, g_dict['text'])
                        assert new_ann_start < new_ann_end
                        annotation = Annotation(ann_start - s_idx,
                                                ann_end - s_idx,
                                                g_dict['type'])

                        # If we have a text span in the stand-off we sanity check
                        # it against what is in the sentence
                        #XXX: Run this again!
                        if g_dict['text'] is not None:
                            g_dict['text'] = unicode(
                                g_dict['text'].strip('\r\n'),
                                encoding='utf-8')  #XXX: Regex is not perfect
                            # it leaves spaces around
                            target_ann_text = sentence.annotation_text(
                                annotation)
                            assert target_ann_text == g_dict['text'], (
                                'text span mismatch in {} '
                                'target: "{}" != source: "{}" {} "{}" {} {} {}'
                            ).format(id, target_ann_text, g_dict['text'],
                                     annotation, sentence.text, g_dict,
                                     type(target_ann_text),
                                     type(g_dict['text']))

                        sentence.add_annotation(annotation)
                    #else:
                    #    assert False, line.replace(' ', '\s').replace('\t', '\\t')

        # Sentences go into the document body; the abstract is left empty.
        yield Document(id, [],
                       [sentence for _, sentence in s_starts_and_sentences],
                       txt_path)
Example #5
0
def _string_to_ann_sent(_string):
    """Wrap *_string* in a Sentence with one untyped Annotation covering it."""
    whole_span = Annotation(0, len(_string), None)
    return Sentence(_string, [whole_span])
Example #6
0
def _get_documents(dir):
    """Yield one Document per aligned resource set found in *dir*.

    For each (txt, ss, a1, a2) resource group this aligns the sentence-split
    file against the raw text, then projects the stand-off annotations from
    the .a1/.a2 files onto the containing sentences, converting file-level
    offsets to sentence-relative offsets. Sentences cut by an annotation
    that crosses a sentence break are merged.
    """
    for id, txt_path, ss_path, a1_path, a2_path in _get_aligned_resources(dir):
        #print id
        # First we align the text and the sentences since we need to map the
        # offsets of the stand-off to map to the sentences in the sentence
        # split file
        #with open(txt_path, 'r') as txt_file, open(ss_path, 'r') as ss_file:
        with open(txt_path, 'r') as txt_file:
            if ENCODE_WRAP:
                txt_file = _encode_wrap(txt_file)
            with open(ss_path, 'r') as ss_file:
                if ENCODE_WRAP:
                    ss_file = _encode_wrap(ss_file)
                #sentences, s_offset_by_sentence = (
                s_starts_and_sentences = (
                        _get_sentences_and_offsets(txt_file, ss_file))

        #XXX: HACK!
        # A missing .a2 file is treated as empty by reading /dev/null.
        if a2_path is None:
            a2_path = '/dev/null'

        #with open(a1_path, 'r') as a1_file, open(a2_path, 'r') as a2_file:
        with open(a1_path, 'r') as a1_file:
            if ENCODE_WRAP:
                a1_file = _encode_wrap(a1_file)
            with open(a2_path, 'r') as a2_file:
                if ENCODE_WRAP:
                    a2_file = _encode_wrap(a2_file)
                for line in (l.rstrip('\n') for l in chain(a1_file, a2_file)):
                    # We ignore everything apart from the text-bound annotations
                    match = TB_SO_REGEX.match(line)
                    if match is not None:
                        g_dict = match.groupdict()
                        ann_start = int(g_dict['start'])
                        ann_end = int(g_dict['end'])

                        # Find the sentence and its index containing the annotation
                        # (s_idx is the sentence's start offset in the text file).
                        s_idx, sentence = _find_containing_idx(ann_start,
                                s_starts_and_sentences)
                        
                        # XXX: There are cases where an annotation is cut-off
                        #       by a sentence break. If this is the case, merge
                        #       the sentences.
                        if ann_end > s_idx + len(sentence.text):
                            next_s_idx, next_sentence = _find_containing_idx(
                                    ann_end, s_starts_and_sentences)
                            # Merge the next sentence into this one
                            # XXX: Just assumes a space! May be wrong!
                            sentence = Sentence(sentence.text + ' ' + next_sentence.text,
                                    sentence.annotations + next_sentence.annotations)
                            # Remove the old one
                            s_starts_and_sentences.remove((next_s_idx, next_sentence))

                        # Create an annotation object but adjust the indices to
                        # be relative to the sentence and not to the file
                        new_ann_start = ann_start - s_idx
                        assert 0 <= new_ann_start < len(sentence.text), '0 <= {} < {} ({}, {}) {} "{}" {}'.format(
                                new_ann_start, len(sentence.text), s_idx, g_dict['start'], id, g_dict['text'], s_idx)
                        new_ann_end = ann_end - s_idx
                        assert 0 < new_ann_end <= len(sentence.text), '0 < {} <= {} ({}, {}) {} {}'.format(
                                new_ann_end, len(sentence.text), s_idx, g_dict['end'], id, g_dict['text'])
                        assert new_ann_start < new_ann_end
                        annotation = Annotation(
                                ann_start - s_idx, ann_end - s_idx, g_dict['type'])

                        # If we have a text span in the stand-off we sanity check
                        # it against what is in the sentence
                        #XXX: Run this again!
                        if g_dict['text'] is not None:
                            g_dict['text'] = unicode(g_dict['text'].strip('\r\n'), encoding='utf-8') #XXX: Regex is not perfect
                            # it leaves spaces around
                            target_ann_text = sentence.annotation_text(annotation)
                            assert target_ann_text == g_dict['text'], (
                                    'text span mismatch in {} '
                                    'target: "{}" != source: "{}" {} "{}" {} {} {}'
                                    ).format(id, target_ann_text, g_dict['text'],
                                            annotation, sentence.text, g_dict,
                                            type(target_ann_text), type(g_dict['text']))

                        sentence.add_annotation(annotation)
                    #else:
                    #    assert False, line.replace(' ', '\s').replace('\t', '\\t')

        # Sentences go into the document body; the abstract is left empty.
        yield Document(id, [],
                [sentence for _, sentence in s_starts_and_sentences],
                txt_path)