def get_pos_tags_from_brat(text, annotation_filepath2, verbose=False):
    """Return the tag of every text-bound (``T*``) annotation in a brat file.

    Each non-blank line of the annotation file is split on whitespace and read
    as ``<id> <tag> <start> <end> <text...>``. Lines whose id does not start
    with ``T`` are ignored.

    Args:
        text: Content of the document the annotations refer to; only used to
            check that ``text[start:end]`` agrees with the annotated text.
        annotation_filepath2: Path of the UTF-8 encoded ``.ann`` file.
        verbose: When true, print every parsed annotation.

    Returns:
        list: The tag (second field) of each ``T*`` annotation, in file order.

    Note:
        On a text/annotation mismatch a warning is printed and the function
        blocks on ``input()`` until the user presses Enter.
    """
    pos_tags = []
    with codecs.open(annotation_filepath2, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            if not anno:
                # Blank lines hold no annotation; anno[0] would raise IndexError.
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue

            pos_tag = {
                'id': id_anno,
                'type': anno[1],  # tag
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if verbose:
                print("pos_tag: {0}".format(pos_tag))

            # Check compatibility between brat text and annotation
            span = text[pos_tag['start']:pos_tag['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(pos_tag['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span))
                print("\tanno: {0}".format(pos_tag['text']))
                print("In:", annotation_filepath2)
                input("Press Enter to continue...")

            # Only the tag itself is collected.
            pos_tags.append(pos_tag['type'])
    if verbose:
        print("\n\n")
    return pos_tags
def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    """Load a brat document and its text-bound (``T*``) annotations.

    Each non-blank annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``; lines whose id does not start
    with ``T`` are ignored. A warning is printed when ``text[start:end]`` and
    the annotated text disagree.

    Args:
        text_filepath: Path of the UTF-8 encoded ``.txt`` file.
        annotation_filepath: Path of the UTF-8 encoded ``.ann`` file.
        verbose: When true, print the text and every parsed entity.

    Returns:
        tuple: ``(text, entities)`` where ``entities`` is a list of dicts with
        the keys ``id``, ``type``, ``start``, ``end`` and ``text``.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose:
        print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            if not anno:
                # Blank lines hold no annotation; anno[0] would raise IndexError.
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue

            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if verbose:
                print("entity: {0}".format(entity))

            # Check compatibility between brat text and annotation
            span = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span))
                print("\tanno: {0}".format(entity['text']))

            entities.append(entity)
    if verbose:
        print("\n\n")
    return text, entities
def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    """Read a brat text file and its annotation file into ``(text, entities)``.

    See for example
    ``data/example_unannotated_texts/done/000-introduction.ann``
    (``annotation_filepath``) and
    ``data/example_unannotated_texts/done/000-introduction.txt``
    (``text_filepath``).

    The brat lines read here have the form
    ``<id> <type> <start> <end> <text>``. Only annotations whose id has the
    form ``T*`` are kept. Blank lines are skipped.

    Args:
        text_filepath: Path of the UTF-8 encoded ``.txt`` file.
        annotation_filepath: Path of the UTF-8 encoded ``.ann`` file.
        verbose: When true, print the text and every parsed entity.

    Returns:
        tuple: The result has the form::

            (<text>, [
                {
                    "id": ...,
                    "type": ...,
                    "start": ...,
                    "end": ...,
                    "text": ...,
                }
            ])
    """
    # Load the text file
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose:
        print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            if not anno:
                # Blank lines hold no annotation; anno[0] would raise IndexError.
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue

            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if verbose:
                print("entity: {0}".format(entity))

            # Check compatibility between brat text and annotation
            span = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span))
                print("\tanno: {0}".format(entity['text']))

            entities.append(entity)
    if verbose:
        print("\n\n")
    return text, entities
def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    """Load a brat document, its ``T*`` annotations and a mismatch flag.

    Each non-blank annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``; lines whose id does not start
    with ``T`` are ignored.

    When the annotation file name starts with ``d`` a workaround for wrong
    UMLS/MANTRA offsets is applied: whitespace-only annotations are dropped,
    ``end`` is clipped to the text length, and a trailing period is removed
    from spans longer than one character.

    Args:
        text_filepath: Path of the UTF-8 encoded ``.txt`` file.
        annotation_filepath: Path of the UTF-8 encoded ``.ann`` file.
        verbose: When true, print the text and every parsed entity.

    Returns:
        tuple: ``(text, entities, malformatted)``. ``entities`` is a list of
        dicts with the keys ``id``, ``type``, ``start``, ``end`` and ``text``.
        ``malformatted`` is true when at least one annotation did not match
        the text.

    Note:
        On a mismatch a warning is printed and the function blocks on
        ``input()`` until the user presses Enter.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose:
        print("\ntext:\n{0}\n".format(text))

    # The file name does not change while parsing, so decide once whether the
    # workaround applies. startswith() is also safe for an empty base name.
    apply_offset_workaround = ntpath.basename(annotation_filepath).startswith('d')

    # parse annotation file
    entities = []
    malformatted = False
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            if not anno:
                # Blank lines hold no annotation; anno[0] would raise IndexError.
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue

            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }

            # WORKAROUND FOR WRONG ANNOTATIONS COMING FROM UMLS/MANTRA:
            # the END offset should be -1 if the span ends on a period.
            if apply_offset_workaround:
                if entity['text'].strip() in ['\n', '\t', ' ', '']:
                    continue
                if entity['end'] > len(text):
                    entity['end'] = len(text)
                if text[entity['end'] - 1] == '.' and entity['end'] - entity['start'] > 1:
                    entity['end'] = entity['end'] - 1
                    # Keep the annotated text in step with the shortened span.
                    if entity['text'][-1] == '.':
                        entity['text'] = entity['text'][:-1]

            if verbose:
                print("entity: {0}".format(entity))

            # Check compatibility between brat text and annotation
            span = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span))
                print("\tanno: {0}".format(entity['text']))
                print("In:", annotation_filepath)
                input("Press Enter to continue...")
                malformatted = True

            entities.append(entity)
    if verbose:
        print("\n\n")
    return text, entities, malformatted
def output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=False):
    """Save entities in brat format.

    This is the inverse of ``brat_to_conll.get_entities_from_brat``: it writes
    ``<previous_filename>.ann`` into ``brat_output_folder`` and copies the text
    file next to it.

    Args:
        brat_output_folder: Folder that receives the ``.txt`` and ``.ann`` files.
        previous_filename: Name (without extension) to use for the ``.ann``
            file. Nothing is written when it is the empty string.
        entities: Iterable of dicts; the ``label``, ``start`` and ``end`` keys
            of each one are written.
        text_filepath: Path of the ``.txt`` text file.
        text: Content of the text; the annotated text of each entity is taken
            from ``text[start:end]``.
        overwrite: When false, refuse to replace a non-empty annotation file.

    Raises:
        AssertionError: If ``overwrite`` is false and a non-empty annotation
            file already exists at the output path.
    """
    if previous_filename == '':
        return
    output_filepath = os.path.join(brat_output_folder, '{0}.ann'.format(previous_filename))
    if not overwrite:
        # Avoid overriding existing annotation
        if os.path.exists(output_filepath) and os.path.getsize(output_filepath) > 0:
            raise AssertionError("The annotation already exists at: {0}".format(output_filepath))

    # Write the entities to the annotation file
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for i, entity in enumerate(entities):
            annotated_text = utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(
                text[entity['start']:entity['end']])
            output_file.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
                i + 1, entity['label'], entity['start'], entity['end'], annotated_text))

    # Copy the corresponding text file unless it already is the destination.
    # Absolute paths are compared so that equivalent spellings of one path
    # (relative vs. absolute, "./" segments) do not make shutil.copy raise
    # SameFileError.
    destination = os.path.join(brat_output_folder, os.path.basename(text_filepath))
    if os.path.abspath(text_filepath) != os.path.abspath(destination):
        shutil.copy(text_filepath, brat_output_folder)
def output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=False):
    """Write entities to a brat ``.ann`` file and copy the text file beside it.

    Args:
        brat_output_folder: Folder that receives the ``.ann`` file and the copy
            of the text file.
        previous_filename: Base name (without extension) of the ``.ann`` file.
            The function returns immediately when it is the empty string.
        entities: Iterable of dicts providing ``label``, ``start`` and ``end``.
        text_filepath: Path of the text file to copy into the output folder.
        text: Document content; ``text[start:end]`` is written as the
            annotated text of each entity.
        overwrite: When false, a non-empty existing annotation file is never
            replaced.

    Raises:
        AssertionError: If ``overwrite`` is false and the annotation file
            already exists and is not empty.
    """
    if previous_filename == '':
        return

    output_filepath = os.path.join(brat_output_folder, '{0}.ann'.format(previous_filename))
    annotation_present = os.path.exists(output_filepath) and os.path.getsize(output_filepath) > 0
    if not overwrite and annotation_present:
        # Avoid overriding existing annotation
        raise AssertionError("The annotation already exists at: {0}".format(output_filepath))

    # Write the entities to the annotation file
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for number, entity in enumerate(entities, start=1):
            span = text[entity['start']:entity['end']]
            output_file.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
                number, entity['label'], entity['start'], entity['end'],
                utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span)))

    # Copy the corresponding text file. Paths are made absolute before they
    # are compared: a plain string comparison treats "dir/./a.txt" and
    # "dir/a.txt" as different files and shutil.copy then raises SameFileError.
    destination = os.path.join(brat_output_folder, os.path.basename(text_filepath))
    if os.path.abspath(text_filepath) != os.path.abspath(destination):
        shutil.copy(text_filepath, brat_output_folder)
def check_brat_annotation_and_text_compatibility(brat_folder):
    """Check if brat annotation and text files are compatible.

    For every ``*.txt`` file in ``brat_folder`` the ``.ann`` file with the same
    base name is read (an empty one is created when it does not exist). For
    each ``T*`` annotation a warning is printed if ``text[start:end]`` differs
    from the annotated text.

    An annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``. If the fourth field is not an
    integer, the fifth field is used as ``end`` and the text starts at the
    sixth field.

    Args:
        brat_folder: Folder holding the ``.txt`` and ``.ann`` files.
    """
    dataset_type = os.path.basename(brat_folder)
    print("Checking the validity of BRAT-formatted {0} set... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(brat_folder, '*.txt')))
    for text_filepath in text_filepaths:
        # splitext drops only the final extension, so "a.b.txt" pairs with
        # "a.b.ann" rather than "a.ann".
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'UTF-8').close()

        with codecs.open(text_filepath, 'r', 'UTF-8') as f:
            text = f.read()

        # parse annotation file
        with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
            for line in f.read().splitlines():
                anno = line.split()
                if not anno:
                    # Blank lines hold no annotation; anno[0] would raise IndexError.
                    continue
                id_anno = anno[0]
                if id_anno[0] != 'T':
                    continue

                entity = {
                    'id': id_anno,
                    'type': anno[1],
                    'start': int(anno[2]),
                }
                try:
                    entity['end'] = int(anno[3])
                except ValueError:
                    # Fourth field is not a plain integer: take the next field
                    # as the end offset and shift the text by one field.
                    entity['end'] = int(anno[4])
                    entity['text'] = ' '.join(anno[5:])
                else:
                    entity['text'] = ' '.join(anno[4:])

                # Check compatibility between brat text and annotation
                span = text[entity['start']:entity['end']]
                if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                        utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                    print("Warning: brat text and annotation do not match.")
                    print("\ttext: {0}".format(span))
                    print("\tanno: {0}".format(entity['text']))
    print("Done.")
def get_sentences_from_pos_annotations(text, annotation_filepath2, verbose=False):
    """Group the ``T*`` annotations of a brat file into sentences of tokens.

    Each non-blank annotation line is split on whitespace and read as
    ``<id> <tag> <start> <end> <text...>``; lines whose id does not start with
    ``T`` and annotations whose text is only whitespace are ignored. A
    sentence is closed every time a token whose text is exactly ``.`` is read.
    Tokens left over after the last such token form a final sentence instead
    of being dropped.

    Args:
        text: Content of the document the annotations refer to; only used to
            check that ``text[start:end]`` agrees with the annotated text.
        annotation_filepath2: Path of the UTF-8 encoded ``.ann`` file.
        verbose: When true, print every parsed token.

    Returns:
        list: A list of sentences; each sentence is a list of dicts with the
        keys ``start``, ``end`` and ``text``.

    Note:
        On a text/annotation mismatch a warning is printed and the function
        blocks on ``input()`` until the user presses Enter.
    """
    sentences = []
    sentence = []
    with codecs.open(annotation_filepath2, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            if not anno:
                # Blank lines hold no annotation; anno[0] would raise IndexError.
                continue
            if anno[0][0] != 'T':
                continue

            token_dict = {
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            if token_dict['text'].strip() in ['\n', '\t', ' ', '']:
                continue
            if verbose:
                print("token_dict: {0}".format(token_dict))

            # Check compatibility between brat text and annotation
            span = text[token_dict['start']:token_dict['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(token_dict['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span))
                print("\tanno: {0}".format(token_dict['text']))
                print("In:", annotation_filepath2)
                input("Press Enter to continue...")

            sentence.append(token_dict)
            if token_dict['text'] == '.':
                sentences.append(sentence)
                sentence = []
    if sentence:
        # The file did not end on a '.' token: keep the trailing tokens.
        sentences.append(sentence)
    if verbose:
        print("\n\n")
    return sentences
def brat_to_conll(input_folder, output_filepath):
    """Convert the brat documents of a folder into one CoNLL-style file.

    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.

    Every ``*.txt`` file is paired with the ``.ann`` file of the same base
    name (an empty one is created when missing). The text is split into
    sentences and tokens with ``get_sentences_and_tokens_from_spacy``. A token
    overlapping a ``T*`` annotation gets a ``B-``/``I-`` label built from the
    annotation type (with ``-`` replaced by ``_``); all other tokens get ``O``.
    Each token is written as ``<text> <base_filename> <start> <end> <label>``
    and each sentence is followed by an empty line.

    Args:
        input_folder: Folder holding the ``.txt`` and ``.ann`` files.
        output_filepath: Path of the UTF-8 file to write.
    """
    spacy_nlp = spacy.load('en')
    verbose = False
    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))

    # The context manager closes the output file even if a document fails.
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for text_filepath in text_filepaths:
            # splitext drops only the final extension, so "a.b.txt" pairs with
            # "a.b.ann" rather than "a.ann".
            base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
            annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
            # create annotation file if it does not exist
            if not os.path.exists(annotation_filepath):
                codecs.open(annotation_filepath, 'w', 'UTF-8').close()

            # load text
            with codecs.open(text_filepath, 'r', 'UTF-8') as f:
                text = f.read()
            if verbose:
                print("text: {0}".format(text))

            # parse annotation file
            entities = []
            with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
                for line in f.read().splitlines():
                    anno = line.split()
                    if not anno:
                        # Blank lines hold no annotation; anno[0] would raise IndexError.
                        continue
                    id_anno = anno[0]
                    if id_anno[0] != 'T':
                        continue

                    entity = {
                        'id': id_anno,
                        'type': anno[1],
                        'start': int(anno[2]),
                    }
                    try:
                        entity['end'] = int(anno[3])
                    except ValueError:
                        # Fourth field is not a plain integer: take the next
                        # field as the end offset and shift the text by one.
                        entity['end'] = int(anno[4])
                        entity['text'] = ' '.join(anno[5:])
                    else:
                        entity['text'] = ' '.join(anno[4:])
                    if verbose:
                        print("entity: {0}".format(entity))

                    # Check compatibility between brat text and annotation
                    span = text[entity['start']:entity['end']]
                    if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                            utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                        print("Warning: brat text and annotation do not match.")
                        print("\ttext: {0}".format(span))
                        print("\tanno: {0}".format(entity['text']))

                    entities.append(entity)

            sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
            for sentence in sentences:
                inside = False
                for token in sentence:
                    # First annotation (in file order) that overlaps the token.
                    matched_entity = None
                    for entity in entities:
                        if entity['start'] <= token['start'] < entity['end'] or \
                                entity['start'] < token['end'] <= entity['end'] or \
                                token['start'] < entity['start'] < entity['end'] < token['end']:
                            matched_entity = entity
                            break

                    if matched_entity is None:
                        token['label'] = 'O'
                        gold_label = 'O'
                        inside = False
                    else:
                        # Because the ANN doesn't support tag with '-' in it
                        token['label'] = matched_entity['type'].replace('-', '_')
                        prefix = 'I' if inside else 'B'
                        gold_label = '{0}-{1}'.format(prefix, token['label'])
                        # The entity continues unless this token ends exactly
                        # where the entity ends.
                        inside = token['end'] != matched_entity['end']

                    if verbose:
                        print('{0} {1} {2} {3} {4}\n'.format(
                            token['text'], base_filename, token['start'], token['end'], gold_label))
                    output_file.write('{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'], token['end'], gold_label))
                if verbose:
                    print('\n')
                output_file.write('\n')

    print('Done.')
def fix_one(text_filepath):
    """Report whether the brat annotation of a text file is malformatted.

    The annotation file is the ``.ann`` file next to ``text_filepath`` with
    the same base name. Each non-blank line is split on whitespace and read
    as ``<id> <type> <start> <end> <text...>``; lines whose id does not start
    with ``T`` are ignored. A ``T*`` annotation is malformatted when
    ``text[start:end]`` differs from the annotated text, or otherwise when
    ``bad_text`` returns a true value for that span.

    Despite its name this function only detects problems: it neither prints
    anything nor rewrites the annotation file.

    Args:
        text_filepath: Path of the UTF-8 encoded ``.txt`` file.

    Returns:
        bool: True if at least one annotation is malformatted.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()

    base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
    annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        original = f.read()

    malformatted = False
    for line in original.splitlines():
        anno = line.split()
        if not anno:
            # Blank lines hold no annotation; anno[0] would raise IndexError.
            continue
        if anno[0][0] != 'T':
            continue

        start = int(anno[2])
        end = int(anno[3])
        annotated_text = ' '.join(anno[4:])
        span = text[start:end]

        # Check compatibility between brat text and annotation
        if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(annotated_text):
            malformatted = True
        elif bad_text(span):
            malformatted = True
    return malformatted
def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False, scores=False, indices=False):
    """Load a brat document and its ``T*`` annotations, optionally with scores.

    Each non-blank annotation line is split on whitespace and read as
    ``<id> <type> <start> <end> <text...>``; lines whose id does not start
    with ``T`` are ignored. A warning is printed when ``text[start:end]`` and
    the annotated text disagree.

    When ``scores`` is given, the average of ``scores[i..j]`` is stored under
    the key ``"<indices[i][0]>-<indices[j][1]>"`` for every ``i <= j``. An
    entity whose ``"<start>-<end>"`` offsets match such a key gets that
    average, rounded to three decimals, as its ``score``; every other entity
    gets the string ``"no score from model"``.

    Args:
        text_filepath: Path of the UTF-8 encoded ``.txt`` file.
        annotation_filepath: Path of the UTF-8 encoded ``.ann`` file.
        verbose: When true, print the text and every parsed entity.
        scores: Sequence of numeric scores, or a false value to disable
            scoring.
        indices: Sequence aligned with ``scores``; element ``k`` is indexable
            with ``[0]`` (start offset) and ``[1]`` (end offset).

    Returns:
        tuple: ``(text, entities)`` where ``entities`` is a list of dicts with
        the keys ``id``, ``type``, ``start``, ``end``, ``text`` and ``score``.

    Raises:
        AssertionError: If ``scores`` and ``indices`` differ in length.
    """
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()

    index_score_map = {}
    if scores:
        num_scores = len(scores)
        # Raised explicitly (not via ``assert``) so the check survives ``-O``.
        if num_scores != len(indices):
            raise AssertionError("scores are not in sync with text!")

        # Prefix sums make the sum of any span of scores a single subtraction.
        cum_scores = [scores[0]]
        for i in range(1, num_scores):
            cum_scores.append(cum_scores[i - 1] + scores[i])

        for i in range(num_scores):
            st = indices[i][0]
            for j in range(i, num_scores):
                ed = indices[j][1]
                # tag score, considering avg. over scores[i..j]
                tag_score = float(cum_scores[j] - cum_scores[i] + scores[i]) / (j - i + 1)
                index_score_map["%s-%s" % (st, ed)] = tag_score

    if verbose:
        print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            if not anno:
                # Blank lines hold no annotation; anno[0] would raise IndexError.
                continue
            id_anno = anno[0]
            if id_anno[0] != 'T':
                continue

            entity = {
                'id': id_anno,
                'type': anno[1],
                'start': int(anno[2]),
                'end': int(anno[3]),
                'text': ' '.join(anno[4:]),
            }
            key = "%s-%s" % (anno[2], anno[3])
            if key in index_score_map:
                entity['score'] = float("{0:.3f}".format(index_score_map[key]))
            else:
                entity['score'] = "no score from model"
            if verbose:
                print("entity: {0}".format(entity))

            # Check compatibility between brat text and annotation
            span = text[entity['start']:entity['end']]
            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(span) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                print("Warning: brat text and annotation do not match.")
                print("\ttext: {0}".format(span))
                print("\tanno: {0}".format(entity['text']))

            entities.append(entity)
    if verbose:
        print("\n\n")
    return text, entities