def annotated_docs_to_tokens(docs, sentence_pad=False):
    """Align the tokenized sentences of each document with its BIO tokens and labels."""
    text_list = []
    label_list = []
    tokens_list = []
    for doc in docs:
        if sentence_pad:
            text = [[r'<s>'] + text_to_tokens(sent) + [r'<\s>']
                    for sent in text_to_sentences(doc.plain_text_)[0]
                    if len(sent.split()) > 0]
        else:
            text = [text_to_tokens(sent)
                    for sent in text_to_sentences(doc.plain_text_)[0]
                    if len(sent.split()) > 0]
        text_list.append(text)

        # Record the flat positions of the <s>/<\s> padding tokens.
        count = 0
        pad_index = []
        for line in text:
            for idx, word in enumerate(line):
                if word == r'<s>' or word == r'<\s>':
                    pad_index.append(count + idx)
            count += len(line)

        tokens, labels = transform_annotated_document_to_bio_format(doc)

        # Re-align the sentence tokens with the BIO tokens, splitting a word
        # into several tokens where the two tokenizations disagree.
        count = 0
        for i, line in enumerate(text_list[-1]):
            start_count = 0
            for j, word in enumerate(line):
                if word not in [r'<s>', r'<\s>'] and word != tokens[count]:
                    k = 0
                    start_count = count
                    if tokens[count] in word:
                        text_list[-1][i][j] = tokens[count + k]
                        k += 1
                        while count + k < len(tokens) and tokens[count + k] in word:
                            text_list[-1][i].insert(j + k, tokens[count + k])
                            k += 1
                    # print(f'Error: split text= {word}, token{tokens[start_count:count+k]}')
                    count += 1
                elif word not in [r'<s>', r'<\s>']:
                    count += 1

        # Insert placeholder labels/tokens at the padding positions so that
        # labels and tokens stay aligned with the padded sentences.
        for i in pad_index:
            labels.insert(i, 'O')
        for i in pad_index:
            tokens.insert(i, r'<s>')
        label_list.append(labels)
        tokens_list.append(tokens)
    return text_list, label_list, tokens_list
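# A minimal usage sketch (illustrative, not from the original file): `brat_docs`
# is assumed to be a list of annotated documents, e.g. BratInput(path).transform().
#
#   text, labels, tokens = annotated_docs_to_tokens(brat_docs, sentence_pad=True)
#   # text[0]   -> tokenized, <s>/<\s>-padded sentences of the first document
#   # labels[0] -> BIO labels aligned with tokens[0]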
def annotated_docs_to_tokens(docs):
    """Tokenize each document into <s>/<\\s>-padded sentences (tokens only, no labels)."""
    text_list = []
    for doc in docs:
        text = [[r'<s>'] + text_to_tokens(sent) + [r'<\s>']
                for sent in text_to_sentences(doc.plain_text_)[0]
                if len(sent.split()) > 0]
        text_list.append(text)
    return text_list
def is_well_formed_sentence(line):
    """Heuristically decide whether `line` looks like a complete sentence."""
    text = line.strip()
    if not text:
        return False
    # Reject lines starting with a lowercase or non-alphabetic character.
    if text[0].islower():
        return False
    if not text[0].isalpha():
        return False
    # Reject lines that end with a digit or are explicit notes.
    if text[-1].isdigit():
        return False
    if text.startswith("Notes:"):
        return False
    # Accept lines that end with a period or a colon.
    if text[-1] == "." or text[-1] == ":":
        return True
    # Otherwise, accept if an internal period is followed by a capitalized token.
    tokens = text_to_tokens(line)
    for i in range(len(tokens) - 1):
        if tokens[i] == "." and tokens[i + 1][0].isupper():
            return True
    return False
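# Illustrative checks of the heuristic above (these example strings are
# assumptions, not taken from the corpus):
#
#   is_well_formed_sentence("The patient was diagnosed with anemia.")  # True
#   is_well_formed_sentence("and other related findings")              # False: lowercase start
#   is_well_formed_sentence("Notes: see supplementary file")           # False: "Notes:" prefix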
def construct_data(data,
                   annotated_docs,
                   predictions,
                   scope_note,
                   id_dict,
                   ctd_file,
                   c2m_file,
                   use_ELMO=True,
                   elmo_model=None,
                   elmo_dim=1024,
                   device=torch.device('cpu')):
    """Re-format the data into an easily trainable format for PyTorch generators."""
    text = []       # sentence tokens
    text_emb = []   # sentence embeddings
    scope = []      # scope notes
    m_id = []       # MeSH IDs
    mask_list = []  # mention masks
    label = []      # labels for positive and negative examples
    toD = Convert2D(ctd_file, c2m_file)
    skipped_id = []
    for idx, pred_doc in enumerate(annotated_docs):
        tags = predictions[idx]
        o_doc = data[idx]
        tokens, bio_labels = transform_annotated_document_to_bio_format(o_doc)
        new_tags = check_tags(bio_labels, tags)
        entity_list = get_normalizations(o_doc, copy.deepcopy(pred_doc))
        masks = get_masks(new_tags, len(entity_list))
        for i in range(len(entity_list)):
            # Resolve composite IDs, then map remaining identifiers to
            # D MeSH IDs (C-2-D / C2M filter) where possible.
            if '+' in entity_list[i]:
                entity_list[i] = entity_list[i].split('+')[0]
            elif '|' in entity_list[i]:
                entity_list[i] = entity_list[i].split('|')[0]
            if entity_list[i] not in id_dict:
                item = toD.transform(entity_list[i])
                if item is not None:
                    if item not in id_dict:
                        print(f"D MeSH {item} not found in Disease list. "
                              "Skipping this normalization...")
                        skipped_id.append(item)
                        continue
                    entity_list[i] = item
                else:
                    print(f"D MeSH equivalent of {entity_list[i]} not found. "
                          "Skipping this normalization...")
                    skipped_id.append(entity_list[i])
                    continue
            note = []
            # Collect text, scope note, MeSH ID, mask, and positive label.
            if use_ELMO:
                t = [[r'<s>'] + text_to_tokens(sent) + [r'<\s>']
                     for sent in text_to_sentences(pred_doc.plain_text_)[0]
                     if len(sent.split()) > 0]
                char_id = batch_to_ids(t).to(device)
                with torch.no_grad():
                    elmo_emb = elmo_model(char_id)
                t_emb = elmo_emb['elmo_representations'][0].view(
                    -1, elmo_dim).detach().cpu()
                # Drop the all-zero padding vectors.
                t_emb = torch.stack(
                    [tensor for tensor in t_emb
                     if len(np.nonzero(tensor.numpy())[0]) != 0],
                    dim=0)
                text_emb.append(t_emb)
                text.extend(t)

                note = scope_note[id_dict[entity_list[i]]]
                note = batch_to_ids(note).to(device)
                with torch.no_grad():
                    elmo_emb = elmo_model(note)
                note = elmo_emb['elmo_representations'][0].view(
                    -1, elmo_dim).detach().cpu()
                scope.append(note)

                mask = masks[i].tolist()
                mask = adjust_mask(mask, t, tokens)
                mask_list.append(torch.tensor(mask))
            else:
                t = text_to_tokens(pred_doc.plain_text_)
                text.append(t)
                for line in scope_note[id_dict[entity_list[i]]]:
                    if len(line) > 1:
                        note.extend(line[1:-1])
                scope.append(note)
                mask = masks[i].tolist()
                mask = adjust_mask(mask, [t], tokens)
                mask_list.append(torch.tensor(mask))
                assert len(t) == len(mask), \
                    'Length of mask is not equal to length of sentence.'
            m_id.append(entity_list[i])
            label.append(1)
    print('Total skipped: ', len(skipped_id), ' unique skips: ',
          len(set(skipped_id)))

    sample = []
    for i in range(len(text)):
        sample.append(
            (text[i], text_emb[i], scope[i], m_id[i], mask_list[i], label[i]))
    return sample, text
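# A hedged usage sketch: the variable names below (train_docs, pred_docs,
# pred_tags, scope_notes, mesh_id_dict, elmo) and the file names are
# illustrative placeholders, not objects defined in this repository.
#
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   sample, sentences = construct_data(
#       train_docs, pred_docs, pred_tags, scope_notes, mesh_id_dict,
#       ctd_file='CTD_diseases.tsv', c2m_file='C2M_mapping.txt',
#       use_ELMO=True, elmo_model=elmo, elmo_dim=1024, device=device)
#   # Each element of `sample` is a tuple:
#   # (tokens, token_embeddings, scope_note_embedding, mesh_id, mask, label)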
def main():
    parser = argparse.ArgumentParser(description='traditional_models.py')
    parser.add_argument('--path', dest='path', default=None, type=str)
    parser.add_argument('--label', dest='label', default=None, type=str)
    parser.add_argument('--output_dir', dest='output_dir', default=None, type=str)
    parser.add_argument('--model', dest='model', default=None, type=str)
    parser.add_argument('--tokenlevel_file_name',
                        dest='tokenlevel_file_name',
                        default=None,
                        type=str)
    parser.add_argument('--entitylevel_file_name',
                        dest='entitylevel_file_name',
                        default=None,
                        type=str)
    args = parser.parse_args()

    # Import data
    data_path = args.path
    label = args.label
    ann_docs = BratInput(data_path).transform()
    data = retain_annotations(ann_docs, label)
    clean_data = clean_annotated_documents(data)
    non_overlap_data = resolve_overlaps(clean_data)

    # Split the document collection into sentences
    sent_docs = split_annotated_documents(non_overlap_data)

    # Select sentences with fewer than 130 tokens
    short_sentences = []
    for i in sent_docs:
        tokens = text_to_tokens(i.plain_text_)
        if len(tokens) < 130:
            print(len(tokens))
            short_sentences.append(i)

    ### Models ###

    # Cross-validation split
    splitter_2 = CVSplit(strategy="random", n_folds=5)
    splits = splitter_2.make_cv_folds(short_sentences)

    train_1 = splits[1] + splits[2] + splits[3] + splits[4]
    test_1 = splits[0]
    train_2 = splits[0] + splits[2] + splits[3] + splits[4]
    test_2 = splits[1]
    train_3 = splits[0] + splits[1] + splits[3] + splits[4]
    test_3 = splits[2]
    train_4 = splits[0] + splits[1] + splits[2] + splits[4]
    test_4 = splits[3]
    train_5 = splits[0] + splits[1] + splits[2] + splits[3]
    test_5 = splits[4]

    ### Save the different splits
    BratOutput(args.output_dir).transform(train_1)
    BratOutput(args.output_dir).transform(test_1)
    BratOutput(args.output_dir).transform(train_2)
    BratOutput(args.output_dir).transform(test_2)
    BratOutput(args.output_dir).transform(train_3)
    BratOutput(args.output_dir).transform(test_3)
    BratOutput(args.output_dir).transform(train_4)
    BratOutput(args.output_dir).transform(test_4)
    BratOutput(args.output_dir).transform(train_5)
    BratOutput(args.output_dir).transform(test_5)

    np.random.seed(0)
    y_true = np.array([0] * 400 + [1] * 600)
    y_pred = np.random.randint(2, size=1000)

    def pandas_classification_report(y_true, y_pred):
        metrics_summary = precision_recall_fscore_support(y_true=y_true,
                                                          y_pred=y_pred)
        avg = list(
            precision_recall_fscore_support(y_true=y_true,
                                            y_pred=y_pred,
                                            average='weighted'))
        metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
        class_report_df = pd.DataFrame(list(metrics_summary),
                                       index=metrics_sum_index)
        support = class_report_df.loc['support']
        total = support.sum()
        avg[-1] = total
        class_report_df['avg / total'] = avg
        return class_report_df.T

    ### Run the models ###
    idx = 0
    entity_level_results = []
    token_level_df = pd.DataFrame()
    for split in splits:
        test = split
        train_splits = splits[:idx] + splits[idx + 1:]
        train = [item for sublist in train_splits for item in sublist]
        idx += 1

        # Train
        if args.model == 'ExactMatchDictionaryNER':
            ner_model = ExactMatchDictionaryNER(entity_labels=label)
            ner_model.fit(train)
            pred_docs = ner_model.transform(test)
        if args.model == 'BidirectionalLSTM':
            ner_model = BidirectionalLSTM(entity_labels=label)
            ner_model.fit(train)
            pred_docs = ner_model.transform(test)
        if args.model == 'CRF':
            ner_model = CRF(entity_labels=label)
            ner_model.fit(train, max_iterations=100)
            pred_docs = ner_model.transform(test)

        # Evaluate and store (entity-level evaluation)
        metrics_1fold = []
        p, r, f = annotation_precision_recall_f1score(pred_docs,
                                                      test,
                                                      ann_label=label)
        print(p, r, f)
        metrics_1fold.append(p)
        metrics_1fold.append(r)
        metrics_1fold.append(f)
        entity_level_results.append(metrics_1fold)

        # Convert to BIO format: X_test, y_test, X_pred, y_pred
        X_test, y_test = transform_annotated_documents_to_bio_format(
            test, entity_labels=label)
        X_pred, y_pred = transform_annotated_documents_to_bio_format(
            pred_docs, entity_labels=label)

        # Keep only the first len(y_test[i]) predictions of each sentence
        label_pred = []
        for i in range(len(y_pred)):
            unique = y_pred[i][:len(y_test[i])]
            label_pred.append(unique)

        # Flatten the nested lists
        flat_y_test = [item for sublist in y_test for item in sublist]
        flat_y_pred = [item for sublist in label_pred for item in sublist]

        # Report B and I tags separately (token-level evaluation)
        classes = [f'B_{label}', f'I_{label}']
        print(
            classification_report(flat_y_test,
                                  flat_y_pred,
                                  target_names=classes,
                                  digits=4))
        df_class_report = pandas_classification_report(y_true=flat_y_test,
                                                       y_pred=flat_y_pred)
        token_level_df = pd.concat([token_level_df, df_class_report])

    # Save the token-level evaluation report to a CSV file
    token_level_df.to_csv(f'{args.tokenlevel_file_name}.csv', sep=',')

    df = pd.DataFrame(entity_level_results,
                      columns=["Precision", "Recall", "F1 measure"])
    # Save the entity-level evaluation report to a CSV file
    df.to_csv(f'{args.entitylevel_file_name}.csv')
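# Example command line (argument values below are illustrative placeholders):
#
#   python traditional_models.py --path data/brat_corpus --label Disease \
#       --output_dir cv_splits --model CRF \
#       --tokenlevel_file_name token_level_report \
#       --entitylevel_file_name entity_level_report

if __name__ == '__main__':
    main()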