# NOTE: Document, Sentence, Annotation, Aligner, MisalignedError and the
# _-prefixed helpers referenced below are assumed to be defined elsewhere in
# this module; chain is the only name that needs an explicit import here.
from itertools import chain


def __handleParagraph(self, xmlParagraph, paragraph):
    p = paragraph
    p.idx = xmlParagraph.attributes['ID'].nodeValue
    if xmlParagraph.hasChildNodes():
        for xmlNode in xmlParagraph.childNodes:
            # Only <S> (sentence) child nodes carry sentence content
            if xmlNode.nodeName == 'S':
                s = Sentence()
                self.__handleSentence(xmlNode, s)
                p.addSentence(s)
                s.paragraph = p

def _tab_separated_input_to_doc(input):
    # Create a dataset out of the input
    doc = Document(input.name, [], [], '<%s>' % input.name)
    for _string, _type in (l.rstrip('\n').split('\t') for l in input):
        doc.abstract.append(Sentence(_string, [
            Annotation(0, len(_string), _type),
            ]))
    return doc

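# The helper below is a minimal, hypothetical usage sketch and is not part of
# the original module: it shows how _tab_separated_input_to_doc is meant to be
# fed, assuming a two-column "<sentence>\t<type>" file on disk. The helper
# name and the path argument are illustrative assumptions.
def _example_tsv_to_doc(tsv_path):
    with open(tsv_path, 'r') as tsv_file:
        # The file object supplies both the lines and the .name attribute
        # that _tab_separated_input_to_doc uses as the document identifier
        return _tab_separated_input_to_doc(tsv_file)
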
def _get_sentences_and_offsets(txt_handle, ss_handle):
    s_starts_and_sentences = []
    txt_handle_reads = 0
    for s_text in (l.rstrip('\n') for l in ss_handle):
        # XXX: We allow multiple spaces to be aligned due to issues with the SS
        aligner = Aligner(unicode(s_text, encoding='utf-8'),
                ignore_mult=set((' ', )))
        started_at_read = txt_handle_reads
        while True:
            # Feed the aligner one character of raw text at a time
            t_char = unicode(txt_handle.read(1), encoding='utf-8')
            txt_handle_reads += 1
            if not t_char:
                # We ran out of raw text before the sentence was matched
                assert False, ('could not align all sentences for: '
                        '"{}" and "{}" stopped at the sentence: "{}" '
                        'aligner in state: {}').format(txt_handle.name,
                        ss_handle.name, s_text, repr(aligner))
            try:
                if aligner.align(t_char):
                    # We are aligned! Record where the sentence started
                    source_text = _str(aligner)
                    s_starts_and_sentences.append(
                            (started_at_read, Sentence(source_text, [])))
                    break
            except MisalignedError:
                # Restart the alignment from the current position
                started_at_read = txt_handle_reads
    return s_starts_and_sentences

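# Another hypothetical sketch, not part of the original module: pair a raw
# .txt file with its sentence-split counterpart and collect the
# (read-offset, Sentence) tuples produced by _get_sentences_and_offsets above.
# The helper name and both path arguments are assumptions for illustration.
def _example_sentence_starts(txt_path, ss_path):
    with open(txt_path, 'r') as txt_file:
        with open(ss_path, 'r') as ss_file:
            return _get_sentences_and_offsets(txt_file, ss_file)
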
def _get_documents(dir):
    for id, txt_path, ss_path, a1_path, a2_path in _get_aligned_resources(dir):
        # First we align the text and the sentences since we need the offsets
        # of the stand-off annotations to map to the sentences in the
        # sentence-split file
        with open(txt_path, 'r') as txt_file:
            if ENCODE_WRAP:
                txt_file = _encode_wrap(txt_file)
            with open(ss_path, 'r') as ss_file:
                if ENCODE_WRAP:
                    ss_file = _encode_wrap(ss_file)
                s_starts_and_sentences = _get_sentences_and_offsets(
                        txt_file, ss_file)

        # XXX: HACK! Read from an empty file when there is no a2 resource
        if a2_path is None:
            a2_path = '/dev/null'

        with open(a1_path, 'r') as a1_file:
            if ENCODE_WRAP:
                a1_file = _encode_wrap(a1_file)
            with open(a2_path, 'r') as a2_file:
                if ENCODE_WRAP:
                    a2_file = _encode_wrap(a2_file)

                for line in (l.rstrip('\n') for l in chain(a1_file, a2_file)):
                    # We ignore everything apart from the text-bound
                    # annotations
                    match = TB_SO_REGEX.match(line)
                    if match is None:
                        continue

                    g_dict = match.groupdict()
                    ann_start = int(g_dict['start'])
                    ann_end = int(g_dict['end'])

                    # Find the sentence containing the annotation along with
                    # the offset at which the sentence starts
                    s_idx, sentence = _find_containing_idx(
                            ann_start, s_starts_and_sentences)

                    # XXX: There are cases where an annotation is cut off by
                    #   a sentence break. If this is the case, merge the
                    #   sentences.
                    if ann_end > s_idx + len(sentence.text):
                        next_s_idx, next_sentence = _find_containing_idx(
                                ann_end, s_starts_and_sentences)
                        # Merge the next sentence into this one
                        # XXX: Just assumes a space! May be wrong!
                        sentence = Sentence(
                                sentence.text + ' ' + next_sentence.text,
                                sentence.annotations
                                + next_sentence.annotations)
                        # Remove the old one
                        s_starts_and_sentences.remove(
                                (next_s_idx, next_sentence))

                    # Create an annotation object but adjust the indices to
                    # be relative to the sentence and not to the file
                    new_ann_start = ann_start - s_idx
                    assert 0 <= new_ann_start < len(sentence.text), (
                            '0 <= {} < {} ({}, {}) {} "{}" {}').format(
                            new_ann_start, len(sentence.text), s_idx,
                            g_dict['start'], id, g_dict['text'], s_idx)
                    new_ann_end = ann_end - s_idx
                    assert 0 < new_ann_end <= len(sentence.text), (
                            '0 < {} <= {} ({}, {}) {} {}').format(
                            new_ann_end, len(sentence.text), s_idx,
                            g_dict['end'], id, g_dict['text'])
                    assert new_ann_start < new_ann_end
                    annotation = Annotation(new_ann_start, new_ann_end,
                            g_dict['type'])

                    # If we have a text span in the stand-off we sanity check
                    # it against what is in the sentence
                    if g_dict['text'] is not None:
                        # XXX: The regex is not perfect, it leaves spaces
                        #   around the span
                        g_dict['text'] = unicode(g_dict['text'].strip('\r\n'),
                                encoding='utf-8')
                        target_ann_text = sentence.annotation_text(annotation)
                        assert target_ann_text == g_dict['text'], (
                                'text span mismatch in {} '
                                'target: "{}" != source: "{}" {} "{}" {} {} {}'
                                ).format(id, target_ann_text, g_dict['text'],
                                annotation, sentence.text, g_dict,
                                type(target_ann_text), type(g_dict['text']))

                    sentence.add_annotation(annotation)

        yield Document(id, [],
                [sentence for _, sentence in s_starts_and_sentences],
                txt_path)

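# Hypothetical smoke test, not part of the original module: drain the
# _get_documents generator for a corpus directory and count how many documents
# were aligned without tripping any of the assertions above. The directory
# layout is whatever _get_aligned_resources expects; the helper name is an
# assumption.
def _example_count_documents(corpus_dir):
    return sum(1 for _ in _get_documents(corpus_dir))
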
def _string_to_ann_sent(_string):
    # Wrap a bare string as a Sentence covered by a single untyped Annotation
    return Sentence(_string, [Annotation(0, len(_string), None)])