Example #1
import codecs
import os

def load(filename):
    """Construct a new Story on the basis of some input file."""
    # Entity, Scene, Story, read_annotation_file and
    # regex_sentence_boundary_gen come from the surrounding project.
    filepath, _ = os.path.splitext(filename)
    story_id = os.path.basename(filepath)
    characters, locations, entities = read_annotation_file(
        filepath + '.ann')
    characters = [Entity(i, character)
                  for i, character in enumerate(characters)]
    locations = [Entity(i, location)
                 for i, location in enumerate(locations)]
    scenes = []
    with codecs.open(filepath + '.txt', encoding='utf-8') as infile:
        for start, end in regex_sentence_boundary_gen(infile.read()):
            scenes.append(Scene(start, end))
    for scene in scenes:
        for character in characters:
            for mention, _, _, _ in character.chain:
                if (entities[mention].start >= scene.start and
                        entities[mention].end <= scene.end):
                    scene.characters.add(character)
        for location in locations:
            for mention, _, _, _ in location.chain:
                if (entities[mention].start >= scene.start and
                        entities[mention].end <= scene.end):
                    scene.locations.add(location)
    return Story(story_id, filepath, scenes, set(characters), set(locations))
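A hypothetical call might look like the sketch below; the path and the attribute names read in the loop are assumptions for illustration, not part of the original project.

# Hypothetical usage; assumes corpus/alice.ann and corpus/alice.txt exist.
story = load('corpus/alice.txt')
for scene in story.scenes:
    print(scene.start, scene.end, len(scene.characters), len(scene.locations))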
Example #2
def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():
            offsets.append((start_offset, end_offset))
    return offsets
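A minimal usage sketch, assuming brat's ssplit module is importable; the sample text is made up:

# Hypothetical usage: spans whose text is empty after stripping are skipped.
doc = "First sentence. Second sentence.\n\nThird sentence."
for start, end in generate_sentence_boundaries(doc):
    print(start, end, repr(doc[start:end]))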
Example #3
def sentencebreaks_to_newlines(text):
    """Replace inter-sentence whitespace with newlines where possible.

    Returns (splittext, junk_mark); junk_mark is True when a sanity
    check failed and the result should not be trusted.
    """
    junk_mark = False

    offsets = list(regex_sentence_boundary_gen(text))

    # break into sentences
    sentences = list(_text_by_offsets_gen(text, offsets))

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0] and
                    text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append(
                    '\n' + text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if offsets and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check

    if text != ''.join(orig_parts):
        print("INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (text, ''.join(orig_parts)))
        junk_mark = True

    splittext = ''.join(new_parts)

    # sanity

    if len(text) != len(splittext):
        print("INTERNAL ERROR")
        junk_mark = True

    if _normspace(text) != _normspace(splittext):
        print("INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (_normspace(text), _normspace(splittext)))
        junk_mark = True

    return splittext, junk_mark
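Unlike the variants below, this version reports failures through the returned flag rather than asserting, so a caller can fall back to the original text. A minimal sketch, assuming the splitter breaks between the two sentences:

splittext, junk_mark = sentencebreaks_to_newlines("First sentence. Second sentence.")
if not junk_mark:
    print(splittext)  # expected: 'First sentence.\nSecond sentence.'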
Example #4
def sentencebreaks_to_newlines(text):
    # step over two characters after a break when the text uses CRLF endings
    line_offset = 1
    if "\r\n" in text:
        line_offset = 2
    offsets = list(regex_sentence_boundary_gen(text))

    # break into sentences
    sentences = list(_text_by_offsets_gen(text, offsets))

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0]
                    and text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append('\n' + text[offsets[i][1] +
                                             line_offset:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if offsets and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(
        orig_parts), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(
        splittext), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            _normspace(text), _normspace(splittext))

    return splittext
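This variant differs from Example #3 in skipping two characters after each break for CRLF input and in enforcing the sanity checks with assertions. Note that inserting a single '\n' while skipping two characters shortens the result, so the length assertion appears to fail on CRLF input; the sketch below therefore uses LF-only text, where the function behaves like Example #6:

# Hypothetical usage with LF-only input (line_offset stays 1).
print(sentencebreaks_to_newlines("First sentence. Second sentence."))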
Example #5
def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():

            while doc[start_offset] == " ":
                start_offset += 1

            while doc[end_offset - 1] == " ":
                end_offset -= 1

            assert start_offset < end_offset

            offsets.append((start_offset, end_offset))

    return offsets
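Compared with Example #2, this version also tightens each span so it excludes leading and trailing spaces. A small sketch with made-up text:

# Hypothetical usage: the printed spans carry no surrounding spaces.
doc = "  Padded sentence.  Another one. "
for start, end in generate_sentence_boundaries(doc):
    print(repr(doc[start:end]))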
Example #6
def sentencebreaks_to_newlines(text):
    offsets = list(regex_sentence_boundary_gen(text))

    # break into sentences
    sentences = list(_text_by_offsets_gen(text, offsets))

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0] and
                    text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append(
                    '\n' + text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if offsets and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(orig_parts), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
        text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(splittext), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
        _normspace(text), _normspace(splittext))

    return splittext
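These examples all rely on private helpers from brat's ssplit module whose source is not shown here. A plausible minimal sketch, consistent with how Examples #3-#6 use them (an assumption, not the actual brat code; Example #7 appears to use a variant of _text_by_offsets_gen that yields (start, end, text) triples):

import re

def _normspace(text):
    # map every whitespace character to a plain space, so that the
    # comparison ignores which kind of whitespace was used
    return re.sub(r'\s', ' ', text)

def _text_by_offsets_gen(text, offsets):
    # yield the text slice for each (start, end) offset pair
    for start, end in offsets:
        yield text[start:end]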
Example #7
def build_text_structure(ann, txt_file_path):
    '''
    Splits a text file into paragraphs, sentences and words and returns the FoLiA document.
    For every word it checks two main things:
    1) Is the word part of any entities? If so, the word is added to a list of lists of words.
    2) Is there an entity that ends with this word? If so, the entity is created from the
    right words in that list, and the element is deleted once the words have been taken out.
    After every sentence and paragraph, all the entities that started and ended within that
    structure are added to the EntitiesLayer.
    '''
    # folia, build_entities_attr and EntityNotFoundError come from the surrounding project.
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen

    def add_list_entities(struct, folia_entities):
        # check whether any entities have to be added, and add them if needed
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:
                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc, subset=attr.type, cls=str(attr.value)))
    try:
        # sort entities by offset instead of id
        entities = sorted(ann.get_textbounds(),
                          key=lambda entity: (entity.start, -entity.end))
        index = 0
        doc = folia.Document(id='brat')

        attributes = build_entities_attr(ann)

        folia_text = doc.append(folia.Text)
        paragraph = folia_text.append(folia.Paragraph)
        folia_sentence = 0
        par_start = 0
        # fictive sets
        doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {}}
        doc.annotations.append((folia.AnnotationType.ENTITY, "entiteit_set.xml"))
        doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {}}
        doc.annotations.append((folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml"))

        entity = entities[index]
        entities_words = []
        inner_index = 0
        entities_words.append([])

        folia_entitiesLayer_par = []
        folia_entitiesLayer_sen = []
        folia_entitiesLayer_txt = []

        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
        offsets = list(regex_sentence_boundary_gen(text))
        for start, end, sentence in _text_by_offsets_gen(text, offsets):
            if start == end and text[start - 1] == '\n':
                add_list_entities(paragraph, folia_entitiesLayer_par)
                folia_entitiesLayer_par = []
                paragraph = folia_text.append(folia.Paragraph)
                par_start = start
            elif sentence != "":
                add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                folia_entitiesLayer_sen = []
                folia_sentence = paragraph.append(folia.Sentence, sentence)
            offsetsw = list(gtb_token_boundary_gen(sentence))
            for tok in _text_by_offsets_gen(sentence, offsetsw):
                entity = entities[index]
                inner_index = 0
                folia_word = folia_sentence.append(folia.Word, tok[2])
                morph_layer = ""
                # check whether the word is part of the entity; if so, remember the folia word
                while entity.start <= entities[index].end:
                    while len(entities_words) <= inner_index:
                        entities_words.append([])
                    for span_start, span_end in entity.spans:
                        if span_start <= tok[0] + start and tok[1] + start <= span_end:
                            entities_words[inner_index].append(doc[folia_word.id])
                        # entity ends within the word
                        elif tok[1] + start >= span_end and span_end > tok[0] + start:
                            offset_start = span_start - (start + tok[0])
                            if offset_start < 0:  # entity started before this word
                                offset_start = 0
                            offset_end = span_end - (start + tok[0])
                            string = tok[2][offset_start:offset_end]
                            if not morph_layer:
                                morph_layer = folia_word.append(folia.MorphologyLayer)
                            morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                            morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                            entities_words[inner_index].append(doc[morph.id])
                        # entity starts within the word
                        elif tok[1] + start > span_start and span_start >= tok[0] + start:
                            offset_start = span_start - (start + tok[0])
                            offset_end = span_end - (start + tok[0])
                            string = tok[2][offset_start:offset_end]
                            if not morph_layer:
                                morph_layer = folia_word.append(folia.MorphologyLayer)
                            morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                            morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                            entities_words[inner_index].append(doc[morph.id])
                    inner_index = inner_index + 1
                    if len(entities) > index + inner_index:
                        entity = entities[index + inner_index]
                    else:
                        break
                entity = entities[index]
                inner_index = 0
                # check for the end of an entity; append it to the text, paragraph
                # or sentence depending on where the entity starts
                current_index = index
                while entity.start <= entities[current_index].end:
                    if entity.end <= start + tok[1] and entity.start <= start + tok[0]:
                        if entity.start >= start:
                            folia_entitiesLayer = folia_entitiesLayer_sen
                        elif entity.start >= par_start:
                            folia_entitiesLayer = folia_entitiesLayer_par
                        else:
                            folia_entitiesLayer = folia_entitiesLayer_txt
                        if entities_words[inner_index]:
                            folia_entity = folia.Entity(doc, cls=entity.type, id=entity.id,
                                                        contents=entities_words[inner_index])
                            folia_entitiesLayer.append(folia_entity)
                        elif not any(x.id == entity.id for x in folia_entitiesLayer):
                            # check whether the entity was already added
                            try:
                                doc[entity.id]
                            except KeyError:
                                raise EntityNotFoundError(entity)
                        if inner_index == 0:
                            entities_words.pop(0)
                            if len(entities) > index + 1:
                                index = index + 1
                                for i in range(0, len(entities_words)):
                                    if not entities_words[0]:
                                        entities_words.pop(0)
                                        index = index + 1
                            else:
                                break
                        elif inner_index > 0:
                            entities_words[inner_index] = []
                            inner_index = inner_index + 1
                    else:
                        inner_index = inner_index + 1
                    if len(entities) > index + inner_index:
                        entity = entities[index + inner_index]
                    else:
                        break
        add_list_entities(paragraph, folia_entitiesLayer_par)
        add_list_entities(folia_sentence, folia_entitiesLayer_sen)
        add_list_entities(folia_text, folia_entitiesLayer_txt)
        return doc
    except IOError:
        pass  # most likely a broken pipe
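A hypothetical driver for this function. TextAnnotations is brat's annotation reader and Document.save is the FoLiA serializer, but the paths and the surrounding wiring are assumptions:

# Hypothetical wiring (paths are assumptions).
from annotation import TextAnnotations

ann = TextAnnotations('data/example')               # reads data/example.ann
doc = build_text_structure(ann, 'data/example.txt')
if doc is not None:                                 # None when an IOError was swallowed
    doc.save('data/example.folia.xml')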