Python replace_unicode_whitespaces_with_ascii_whitespace 예제들, utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace Python 예제들

예제 #1

0

파일 보기

파일: brat_to_conll2.py 프로젝트: soares-f/PharmacoNER

def get_pos_tags_from_brat(text, annotation_filepath2, verbose=False):
    """Return the type field of every text-bound (``T*``) annotation in a brat file.

    Each line of the annotation file is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``. Lines whose id does not start
    with ``T`` are ignored, and so are blank lines.

    When the annotated text differs from ``text[start:end]`` (after unicode
    whitespace normalisation) a warning is printed and the function blocks on
    ``input()`` until the user presses Enter; the tag is still collected.

    Args:
        text: Document text that the annotation offsets refer to.
        annotation_filepath2: Path of the UTF-8 encoded annotation file.
        verbose: When true, print every parsed annotation.

    Returns:
        List with the ``type`` field of each ``T*`` annotation, in file order.
    """
    pos_tags = []
    with codecs.open(annotation_filepath2, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            # A blank or whitespace-only line splits to [], which used to
            # crash on anno[0]; it carries no annotation, so skip it.
            if not anno:
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue
            pos_tag = {
                'id': id_anno,
                'type': anno[1],  # tag
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if verbose:
                print("pos_tag: {0}".format(pos_tag))
            # Check compatibility between brat text and annotation
            span_text = text[pos_tag['start']:pos_tag['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(pos_tag['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span_text))
                print("\tanno: {0}".format(pos_tag['text']))
                print("In:", annotation_filepath2)
                input("Press Enter to continue...")
            pos_tags.append(pos_tag['type'])
    if verbose: print("\n\n")

    return pos_tags

예제 #2

0

파일 보기

def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    """Load a document and its text-bound (``T*``) brat annotations.

    Each annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``. Lines whose id does not start
    with ``T`` are ignored, and so are blank lines. A warning is printed when
    the annotated text differs from ``text[start:end]`` after unicode
    whitespace normalisation; the entity is kept either way.

    Args:
        text_filepath: Path of the UTF-8 encoded text file.
        annotation_filepath: Path of the UTF-8 encoded annotation file.
        verbose: When true, print the text and every parsed entity.

    Returns:
        Tuple ``(text, entities)`` where each entity is a dict with the keys
        ``id``, ``type``, ``start``, ``end`` and ``text``.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose: print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            # A blank or whitespace-only line splits to [], which used to
            # crash on anno[0]; it carries no annotation, so skip it.
            if not anno:
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue
            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if verbose:
                print("entity: {0}".format(entity))
            # Check compatibility between brat text and annotation
            span_text = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span_text))
                print("\tanno: {0}".format(entity['text']))
            entities.append(entity)
    if verbose: print("\n\n")

    return text, entities

예제 #3

0

파일 보기

파일: brat_to_conll.py 프로젝트: hmchuong/NeuroNER

def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    """
    Return a tuple of the text and its entities, read from a text file and an
    annotation file in brat format.

    See data/example_unannotated_texts/done/000-introduction.ann for an
    example annotation_filepath and
    data/example_unannotated_texts/done/000-introduction.txt for an example
    text_filepath.

    A brat annotation line has the form: <id> <type> <start> <end> <text>
    Only annotations whose id has the form T* are kept; blank lines are
    skipped. A warning is printed when the annotated text differs from
    text[start:end] after unicode whitespace normalisation; the entity is
    kept either way.

    The result has the form:
    (<text>,[
        {
            "id":...,
            "type": ...,
            "start": ...,
            "end": ...,
            "text": ...,
        }
    ])
    """
    # Load the text file
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose: print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            # A blank or whitespace-only line splits to [], which used to
            # crash on anno[0]; it carries no annotation, so skip it.
            if not anno:
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue
            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if verbose:
                print("entity: {0}".format(entity))
            # Check compatibility between brat text and annotation
            span_text = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span_text))
                print("\tanno: {0}".format(entity['text']))
            entities.append(entity)
    if verbose: print("\n\n")

    return text, entities

예제 #4

0

파일 보기

파일: brat_to_conll_compatible_tokenization.py 프로젝트: soares-f/PharmacoNER

def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    """Load a document and its ``T*`` brat annotations, flagging mismatches.

    Each annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``. Lines whose id does not start
    with ``T`` are ignored, and so are blank lines.

    For annotation files whose basename starts with ``'d'`` a workaround is
    applied first: entities with empty text are dropped, ``end`` is clipped
    to the document length, and a trailing ``'.'`` is removed from spans
    longer than one character.

    When the annotated text differs from ``text[start:end]`` (after unicode
    whitespace normalisation) a warning is printed, the function blocks on
    ``input()`` until Enter is pressed, and the file is reported as
    malformatted; the entity is kept either way.

    Args:
        text_filepath: Path of the UTF-8 encoded text file.
        annotation_filepath: Path of the UTF-8 encoded annotation file.
        verbose: When true, print the text and every parsed entity.

    Returns:
        Tuple ``(text, entities, malformatted)``; each entity is a dict with
        the keys ``id``, ``type``, ``start``, ``end`` and ``text``, and
        ``malformatted`` is True if any entity failed the text check.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose: print("\ntext:\n{0}\n".format(text))

    # The workaround below is keyed on the file name, so decide once.
    apply_workaround = ntpath.basename(annotation_filepath)[0] == 'd'

    # parse annotation file
    entities = []
    malformatted = False
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            # A blank or whitespace-only line splits to [], which used to
            # crash on anno[0]; it carries no annotation, so skip it.
            if not anno:
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue
            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            # WORKAROUND FOR WRONG ANNOTATIONS COMING FROM UMLS/MANTRA:
            # The END offset should be -1 if the next char is a punctuation sign.
            if apply_workaround:
                if not entity['text'].strip():
                    continue
                if entity['end'] > len(text):
                    entity['end'] = len(text)
                # Test the span length first so that an empty document (or a
                # zero end offset) is never indexed at text[end - 1].
                if entity['end'] - entity['start'] > 1 and \
                        text[entity['end'] - 1] == '.':
                    entity['end'] = entity['end'] - 1
                    if entity['text'][-1] == '.':
                        entity['text'] = entity['text'][:-1]
            if verbose:
                print("entity: {0}".format(entity))
            # Check compatibility between brat text and annotation
            span_text = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span_text))
                print("\tanno: {0}".format(entity['text']))
                print("In:", annotation_filepath)
                input("Press Enter to continue...")
                malformatted = True
            entities.append(entity)
    if verbose: print("\n\n")
    return text, entities, malformatted

예제 #5

0

파일 보기

파일: conll_to_brat.py 프로젝트: hmchuong/NeuroNER

def output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=False):
    """
    Save entities to disk in brat format; the inverse of
    brat_to_conll.get_entities_from_brat.

    brat_output_folder: folder that receives the .txt and .ann files
    previous_filename: name (without extension) for the .ann file; nothing is
        written when it is the empty string
    entities: iterable of dicts; the 'label', 'start' and 'end' keys are read
    text_filepath: path of the .txt file, copied into brat_output_folder
        unless it already is the file located there
    text: content of the text; text[start:end] is written as each entity's text
    overwrite: when False, raise AssertionError if a non-empty .ann file
        already exists at the output path
    """
    if previous_filename == '':
        return
    output_filepath = os.path.join(brat_output_folder, '{0}.ann'.format(previous_filename))
    # Avoid overriding existing annotation
    if not overwrite and os.path.exists(output_filepath) and os.path.getsize(output_filepath) > 0:
        raise AssertionError("The annotation already exists at: {0}".format(output_filepath))
    # Write the entities to the annotation file (ids are 1-based: T1, T2, ...)
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for number, entity in enumerate(entities, start=1):
            span_text = utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(
                text[entity['start']:entity['end']])
            output_file.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
                number, entity['label'], entity['start'], entity['end'], span_text))
    # Copy the corresponding text file. Compare resolved paths rather than raw
    # strings: a differently spelled path to the same file (relative, './',
    # symlink) would otherwise reach shutil.copy and raise SameFileError.
    destination_filepath = os.path.join(brat_output_folder, os.path.basename(text_filepath))
    if os.path.realpath(text_filepath) != os.path.realpath(destination_filepath):
        shutil.copy(text_filepath, brat_output_folder)

예제 #6

0

파일 보기

def output_entities(brat_output_folder,
                    previous_filename,
                    entities,
                    text_filepath,
                    text,
                    overwrite=False):
    """Write entities to ``<previous_filename>.ann`` and copy the text file.

    Args:
        brat_output_folder: Folder that receives the ``.ann`` file and the
            copy of the text file.
        previous_filename: Name (without extension) of the ``.ann`` file.
            Nothing is written when it is the empty string.
        entities: Iterable of dicts; the ``label``, ``start`` and ``end``
            keys are read. Ids are assigned as ``T1``, ``T2``, ... in order.
        text_filepath: Path of the text file. It is copied into
            ``brat_output_folder`` unless it already is the file there.
        text: Document text; ``text[start:end]`` (with unicode whitespace
            normalised) is written as each entity's text.
        overwrite: When False, refuse to replace a non-empty ``.ann`` file.

    Raises:
        AssertionError: If ``overwrite`` is False and a non-empty annotation
            file already exists at the output path.
    """
    if previous_filename == '':
        return
    output_filepath = os.path.join(brat_output_folder,
                                   '{0}.ann'.format(previous_filename))
    # Avoid overriding existing annotation
    already_annotated = (os.path.exists(output_filepath)
                         and os.path.getsize(output_filepath) > 0)
    if not overwrite and already_annotated:
        raise AssertionError(
            "The annotation already exists at: {0}".format(output_filepath))
    # Write the entities to the annotation file
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for number, entity in enumerate(entities, start=1):
            start, end = entity['start'], entity['end']
            span_text = utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(
                text[start:end])
            output_file.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
                number, entity['label'], start, end, span_text))
    # Copy the corresponding text file. Resolved paths are compared because a
    # raw string comparison misses equivalent spellings of the same file
    # (relative, './', symlink), and shutil.copy then raises SameFileError.
    destination_filepath = os.path.join(brat_output_folder,
                                        os.path.basename(text_filepath))
    if os.path.realpath(text_filepath) != os.path.realpath(destination_filepath):
        shutil.copy(text_filepath, brat_output_folder)

예제 #7

0

파일 보기

파일: brat_to_conll.py 프로젝트: zilongzhong/NeuroNER

def check_brat_annotation_and_text_compatibility(brat_folder):
    '''
    Check if brat annotation and text files are compatible.

    For every '*.txt' file in brat_folder the sibling '.ann' file is read
    (an empty one is created if it does not exist). For each annotation whose
    id starts with 'T', a warning is printed when the annotated text differs
    from text[start:end] after unicode whitespace normalisation.

    Annotation lines are split on whitespace and read as
    <id> <type> <start> <end> <text...>. If the <end> field is not an
    integer, the following field is used as the end offset instead and the
    text starts one field later. Blank lines are skipped.

    Nothing is returned; results are reported on stdout only.
    '''
    dataset_type = os.path.basename(brat_folder)
    print("Checking the validity of BRAT-formatted {0} set... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(brat_folder, '*.txt')))
    for text_filepath in text_filepaths:
        # splitext strips only the final extension, so 'report.v2.txt' pairs
        # with 'report.v2.ann' instead of being truncated to 'report.ann'.
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'UTF-8').close()
        with codecs.open(text_filepath, 'r', 'UTF-8') as f:
            text = f.read()
        # parse annotation file
        with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
            for line in f.read().splitlines():
                anno = line.split()
                # A blank or whitespace-only line splits to [], which used to
                # crash on anno[0]; it carries no annotation, so skip it.
                if not anno:
                    continue
                if anno[0][0] != 'T':
                    continue
                start = int(anno[2])
                try:
                    end = int(anno[3])
                except ValueError:
                    # <end> is not a plain integer: take the next field as
                    # the end offset and shift the text by one field.
                    end = int(anno[4])
                    annotated_text = ' '.join(anno[5:])
                else:
                    annotated_text = ' '.join(anno[4:])
                # Check compatibility between brat text and annotation
                span_text = text[start:end]
                if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                        utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(annotated_text):
                    print("Warning: brat text and annotation do not match.")
                    print("\ttext: {0}".format(span_text))
                    print("\tanno: {0}".format(annotated_text))
    print("Done.")

예제 #8

0

파일 보기

파일: brat_to_conll_compatible_tokenization.py 프로젝트: soares-f/PharmacoNER

def get_sentences_from_pos_annotations(text,
                                       annotation_filepath2,
                                       verbose=False):
    """Group the ``T*`` annotations of a brat file into sentences of tokens.

    Each annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``. Lines whose id does not start
    with ``T`` are ignored, as are blank lines and tokens with empty text.
    A sentence is closed whenever a token whose text is exactly ``'.'`` is
    appended.

    When a token's text differs from ``text[start:end]`` (after unicode
    whitespace normalisation) a warning is printed and the function blocks on
    ``input()`` until Enter is pressed; the token is kept either way.

    Args:
        text: Document text that the annotation offsets refer to.
        annotation_filepath2: Path of the UTF-8 encoded annotation file.
        verbose: When true, print every parsed token.

    Returns:
        List of sentences; each sentence is a list of dicts with the keys
        ``start``, ``end`` and ``text``.
    """
    sentences = []
    sentence = []
    with codecs.open(annotation_filepath2, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            # A blank or whitespace-only line splits to [], which used to
            # crash on anno[0]; it carries no annotation, so skip it.
            if not anno:
                continue
            if anno[0][0] != 'T':
                continue
            token_dict = {
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if not token_dict['text'].strip():
                continue
            if verbose:
                print("token_dict: {0}".format(token_dict))
            # Check compatibility between brat text and annotation
            span_text = text[token_dict['start']:token_dict['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(token_dict['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span_text))
                print("\tanno: {0}".format(token_dict['text']))
                print("In:", annotation_filepath2)
                input("Press Enter to continue...")
            sentence.append(token_dict)
            if token_dict['text'] == '.':
                sentences.append(sentence)
                sentence = []
    # NOTE(review): tokens after the last '.' are not returned (the pending
    # sentence is discarded). Kept as-is because callers may rely on it --
    # confirm whether this is intended.
    if verbose: print("\n\n")
    return sentences

예제 #9

0

파일 보기

파일: brat_to_conll.py 프로젝트: zilongzhong/NeuroNER

def brat_to_conll(input_folder, output_filepath):
    '''
    Convert the brat documents in input_folder into one CoNLL-style file.

    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.

    For every '*.txt' file the sibling '.ann' file is read (an empty one is
    created if missing) and the annotations whose id starts with 'T' become
    entities. The text is tokenised with get_sentences_and_tokens_from_spacy
    and one line per token is written as
    '<token text> <base filename> <start> <end> <label>', where the label is
    'O', 'B-<type>' or 'I-<type>' ('-' in a type is replaced by '_').
    An empty line is written after each sentence.

    input_folder: folder holding the brat '.txt' / '.ann' pairs
    output_filepath: path of the UTF-8 file to write
    '''
    spacy_nlp = spacy.load('en')
    verbose = False
    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    # The context manager closes the output file even when a document fails
    # to parse part-way through.
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for text_filepath in text_filepaths:
            # splitext strips only the final extension, so 'report.v2.txt'
            # pairs with 'report.v2.ann' instead of 'report.ann'.
            base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
            annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
            # create annotation file if it does not exist
            if not os.path.exists(annotation_filepath):
                codecs.open(annotation_filepath, 'w', 'UTF-8').close()

            # load text
            with codecs.open(text_filepath, 'r', 'UTF-8') as f:
                text = f.read()
            if verbose: print("text: {0}".format(text))

            # parse annotation file
            entities = []
            with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
                for line in f.read().splitlines():
                    anno = line.split()
                    # A blank or whitespace-only line splits to [], which used
                    # to crash on anno[0]; it carries no annotation, skip it.
                    if not anno:
                        continue
                    id_anno = anno[0]
                    if id_anno[0] != 'T':
                        continue
                    entity = {}
                    entity['id'] = id_anno
                    entity['type'] = anno[1]
                    entity['start'] = int(anno[2])
                    try:
                        entity['end'] = int(anno[3])
                    except ValueError:
                        # <end> is not a plain integer: take the next field as
                        # the end offset and shift the text by one field.
                        entity['end'] = int(anno[4])
                        entity['text'] = ' '.join(anno[5:])
                    else:
                        entity['text'] = ' '.join(anno[4:])
                    if verbose:
                        print("entity: {0}".format(entity))
                    # Check compatibility between brat text and annotation
                    span_text = text[entity['start']:entity['end']]
                    if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                            utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                        print("Warning: brat text and annotation do not match.")
                        print("\ttext: {0}".format(span_text))
                        print("\tanno: {0}".format(entity['text']))
                    entities.append(entity)

            sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
            for sentence in sentences:
                inside = False
                for token in sentence:
                    token['label'] = 'O'
                    # First entity overlapping the token, if any.
                    matched_entity = None
                    for entity in entities:
                        if entity['start'] <= token['start'] < entity['end'] or \
                           entity['start'] < token['end'] <= entity['end'] or \
                           token['start'] < entity['start'] < entity['end'] < token['end']:
                            # Because the ANN doesn't support tag with '-' in it
                            token['label'] = entity['type'].replace('-', '_')
                            matched_entity = entity
                            break
                    if token['label'] == 'O':
                        gold_label = 'O'
                        inside = False
                    elif inside:
                        gold_label = 'I-{0}'.format(token['label'])
                    else:
                        inside = True
                        gold_label = 'B-{0}'.format(token['label'])
                    # A token ending exactly where its entity ends closes the
                    # entity, so the next labelled token starts with 'B-'.
                    if matched_entity is not None and token['end'] == matched_entity['end']:
                        inside = False
                    conll_line = '{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'], token['end'], gold_label)
                    if verbose: print(conll_line)
                    output_file.write(conll_line)
                if verbose: print('\n')
                output_file.write('\n')

    print('Done.')
    del spacy_nlp
    del spacy_nlp

예제 #10

0

파일 보기

def fix_one(text_filepath):
    """Report whether the brat annotation next to a text file is malformatted.

    The annotation file is looked up beside ``text_filepath`` with the
    extension replaced by ``.ann``. Each line is split on whitespace and read
    as ``<id> <type> <start> <end> <text...>``; only ids starting with ``T``
    are checked and blank lines are skipped. An annotation counts as
    malformatted when its text differs from ``text[start:end]`` (after
    unicode whitespace normalisation) or, if they agree, when ``bad_text``
    flags the span.

    Despite its name this function does not modify any file: the previous
    reporting/rewriting code sat after an unconditional ``return`` (or inside
    string literals) and never ran, so it has been removed.

    Args:
        text_filepath: Path of the UTF-8 encoded text file.

    Returns:
        True if at least one ``T*`` annotation is malformatted, else False.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
    annotation_filepath = os.path.join(os.path.dirname(text_filepath),
                                       base_filename + '.ann')
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        original = f.read()

    malformatted = False
    # Every line is still parsed after the first problem is found, so a
    # broken offset later in the file raises exactly as it did before.
    for line in original.splitlines():
        anno = line.split()
        # A blank or whitespace-only line splits to [], which used to crash
        # on anno[0]; it carries no annotation, so skip it.
        if not anno:
            continue
        if anno[0][0] != 'T':
            continue
        start = int(anno[2])
        end = int(anno[3])
        annotated_text = ' '.join(anno[4:])
        span_text = text[start:end]
        # Check compatibility between brat text and annotation; bad_text is
        # only consulted when the two texts agree.
        texts_differ = \
            utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
            utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(annotated_text)
        if texts_differ or bad_text(span_text):
            malformatted = True
    return malformatted

예제 #11

0

파일 보기

def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False, scores = False, indices = False):
    """Load a document and its ``T*`` brat annotations, attaching model scores.

    Each annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``. Lines whose id does not start
    with ``T`` are ignored, and so are blank lines. A warning is printed when
    the annotated text differs from ``text[start:end]`` after unicode
    whitespace normalisation; the entity is kept either way.

    When ``scores`` is given, every contiguous run ``i..j`` of items is
    mapped from the key ``"<indices[i][0]>-<indices[j][1]>"`` to the mean of
    ``scores[i..j]``. An entity whose ``"<start>-<end>"`` key is in that map
    gets the mean rounded to three decimals as its ``score``; any other
    entity gets the string ``"no score from model"``.

    Args:
        text_filepath: Path of the UTF-8 encoded text file.
        annotation_filepath: Path of the UTF-8 encoded annotation file.
        verbose: When true, print the text and every parsed entity.
        scores: Sequence of per-item numeric scores, or a falsy value to
            skip scoring.
        indices: Sequence parallel to ``scores``; element ``[0]`` of each
            item is used as a start offset and element ``[1]`` as an end
            offset.

    Returns:
        Tuple ``(text, entities)`` where each entity is a dict with the keys
        ``id``, ``type``, ``start``, ``end``, ``text`` and ``score``.

    Raises:
        AssertionError: If ``scores`` and ``indices`` differ in length.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()

    index_score_map = {}

    if scores:
        num_scores = len(scores)
        num_indices = len(indices)
        assert(num_scores == num_indices), "scores are not in sync with text!"

        # Prefix sums: cum_scores[k] == scores[0] + ... + scores[k]
        cum_scores = [-1]*num_scores
        cum_scores[0] = scores[0]
        for i in range(1, num_scores):
            cum_scores[i] = cum_scores[i-1] + scores[i]

        for i in range(num_scores):
            st = indices[i][0]
            for j in range(i, num_scores):
                ed = indices[j][1]
                # Average score of items i..j. Errors now propagate to the
                # caller instead of dropping into a pdb session and then
                # reusing a stale or unbound tag_score.
                tag_score = float(cum_scores[j] - cum_scores[i] + scores[i]) / (j-i+1)
                key = "%s-%s" % (st, ed)
                index_score_map[key] = tag_score

    if verbose: print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            # A blank or whitespace-only line splits to [], which used to
            # crash on anno[0]; it carries no annotation, so skip it.
            if not anno:
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue
            entity = {}
            entity['id'] = id_anno
            entity['type'] = anno[1]
            entity['start'] = int(anno[2])
            entity['end'] = int(anno[3])
            entity['text'] = ' '.join(anno[4:])
            # The key uses the raw offset fields exactly as written in the file.
            key = "%s-%s" % (anno[2], anno[3])
            entity_score = index_score_map.get(key, "no score from model")
            if entity_score != "no score from model":
                entity_score = float("{0:.3f}".format(entity_score))
            entity['score'] = entity_score
            if verbose:
                print("entity: {0}".format(entity))
            # Check compatibility between brat text and annotation
            span_text = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span_text) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span_text))
                print("\tanno: {0}".format(entity['text']))
            entities.append(entity)
    if verbose: print("\n\n")

    return text, entities